<a href="https://colab.research.google.com/github/pnbayar/PRAVEEN/blob/master/Colab/StyleTTS2_Demo_LJSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install packages and download models

In [1]:
%%shell
git clone https://github.com/yl4579/StyleTTS2.git
cd StyleTTS2
pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git
sudo apt-get install espeak-ng
git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LJSpeech
mv StyleTTS2-LJSpeech/Models .

Cloning into 'StyleTTS2'...
remote: Enumerating objects: 372, done.[K
remote: Total 372 (delta 0), reused 0 (delta 0), pack-reused 372 (from 1)[K
Receiving objects: 100% (372/372), 133.98 MiB | 14.93 MiB/s, done.
Resolving deltas: 100% (199/199), done.
Updating files: 100% (48/48), done.
Collecting git+https://github.com/resemble-ai/monotonic_align.git
  Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-wos3w0c1
  Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-wos3w0c1
  Resolved https://github.com/resemble-ai/monotonic_align.git to commit 78b985be210a03d08bc3acc01c4df0442105366f
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting munch
  Downloading munch-4.0.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3



### Load models

In [2]:
%cd StyleTTS2

import torch
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

import random
random.seed(0)

import numpy as np
np.random.seed(0)

import nltk
nltk.download('punkt')

# load packages
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize

from models import *
from utils import *
from text_utils import TextCleaner
textclenaer = TextCleaner()

%matplotlib inline

device = 'cuda' if torch.cuda.is_available() else 'cpu'

to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
mean, std = -4, 4

def length_to_mask(lengths):
    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
    mask = torch.gt(mask+1, lengths.unsqueeze(1))
    return mask

def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor

def compute_style(ref_dicts):
    reference_embeddings = {}
    for key, path in ref_dicts.items():
        wave, sr = librosa.load(path, sr=24000)
        audio, index = librosa.effects.trim(wave, top_db=30)
        if sr != 24000:
            audio = librosa.resample(audio, sr, 24000)
        mel_tensor = preprocess(audio).to(device)

        with torch.no_grad():
            ref = model.style_encoder(mel_tensor.unsqueeze(1))
        reference_embeddings[key] = (ref.squeeze(1), audio)

    return reference_embeddings

# load phonemizer
import phonemizer
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True, words_mismatch='ignore')

config = yaml.safe_load(open("Models/LJSpeech/config.yml"))

# load pretrained ASR model
ASR_config = config.get('ASR_config', False)
ASR_path = config.get('ASR_path', False)
text_aligner = load_ASR_models(ASR_path, ASR_config)

# load pretrained F0 model
F0_path = config.get('F0_path', False)
pitch_extractor = load_F0_models(F0_path)

# load BERT model
from Utils.PLBERT.util import load_plbert
BERT_path = config.get('PLBERT_dir', False)
plbert = load_plbert(BERT_path)

model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]

params_whole = torch.load("Models/LJSpeech/epoch_2nd_00100.pth", map_location='cpu')
params = params_whole['net']

for key in model:
    if key in params:
        print('%s loaded' % key)
        try:
            model[key].load_state_dict(params[key])
        except:
            from collections import OrderedDict
            state_dict = params[key]
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:] # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
#             except:
#                 _load(params[key], model[key])
_ = [model[key].eval() for key in model]

from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
    clamp=False
)

def inference(text, noise, diffusion_steps=5, embedding_scale=1):
    text = text.strip()
    text = text.replace('"', '')
    ps = global_phonemizer.phonemize([text])
    ps = word_tokenize(ps[0])
    ps = ' '.join(ps)

    tokens = textclenaer(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
        text_mask = length_to_mask(input_lengths).to(tokens.device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(noise,
              embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
              embedding_scale=embedding_scale).squeeze(0)

        s = s_pred[:, 128:]
        ref = s_pred[:, :128]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)
        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_dur[-1] += 5

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
        out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),
                                F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    return out.squeeze().cpu().numpy()

def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):
  text = text.strip()
  text = text.replace('"', '')
  ps = global_phonemizer.phonemize([text])
  ps = word_tokenize(ps[0])
  ps = ' '.join(ps)

  tokens = textclenaer(ps)
  tokens.insert(0, 0)
  tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

  with torch.no_grad():
      input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
      text_mask = length_to_mask(input_lengths).to(tokens.device)

      t_en = model.text_encoder(tokens, input_lengths, text_mask)
      bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
      d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

      s_pred = sampler(noise,
            embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
            embedding_scale=embedding_scale).squeeze(0)

      if s_prev is not None:
          # convex combination of previous and current style
          s_pred = alpha * s_prev + (1 - alpha) * s_pred

      s = s_pred[:, 128:]
      ref = s_pred[:, :128]

      d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

      x, _ = model.predictor.lstm(d)
      duration = model.predictor.duration_proj(x)
      duration = torch.sigmoid(duration).sum(axis=-1)
      pred_dur = torch.round(duration.squeeze()).clamp(min=1)

      pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
      c_frame = 0
      for i in range(pred_aln_trg.size(0)):
          pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
          c_frame += int(pred_dur[i].data)

      # encode prosody
      en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
      F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
      out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),
                              F0_pred, N_pred, ref.squeeze().unsqueeze(0))

  return out.squeeze().cpu().numpy(), s_pred

/content/StyleTTS2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


177


  params = torch.load(model_path, map_location='cpu')['model']
  params = torch.load(path, map_location='cpu')['net']
  checkpoint = torch.load(log_dir + "/step_" + str(iters) + ".t7", map_location='cpu')
  WeightNorm.apply(module, name, dim)
  params_whole = torch.load("Models/LJSpeech/epoch_2nd_00100.pth", map_location='cpu')


bert loaded
bert_encoder loaded
predictor loaded
decoder loaded
text_encoder loaded
predictor_encoder loaded
style_encoder loaded
diffusion loaded
text_aligner loaded
pitch_extractor loaded
mpd loaded
msd loaded
wd loaded


### Synthesize speech

In [7]:
# @title Input Text { display-mode: "form" }
# synthesize a text
text = "manipande kullu pokkade" # @param {type:"string"}


#### Basic synthesis (5 diffusion steps)

In [8]:
start = time.time()
noise = torch.randn(1,1,256).to(device)
wav = inference(text, noise, diffusion_steps=5, embedding_scale=1)
rtf = (time.time() - start) / (len(wav) / 24000)
print(f"RTF = {rtf:5f}")
import IPython.display as ipd
display(ipd.Audio(wav, rate=24000))

RTF = 0.093104


#### With higher diffusion steps (more diverse)
Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed.

In [9]:
start = time.time()
noise = torch.randn(1,1,256).to(device)
wav = inference(text, noise, diffusion_steps=10, embedding_scale=1)
rtf = (time.time() - start) / (len(wav) / 24000)
print(f"RTF = {rtf:5f}")
import IPython.display as ipd
display(ipd.Audio(wav, rate=24000))

RTF = 0.144669


### Speech expressiveness
The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page.

#### With embedding_scale=1
This is the classifier-free guidance scale. The higher the scale, the more conditional the style is to the input text and hence more emotional.

In [10]:
texts = {}
texts['Happy'] = "We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands."
texts['Sad'] = "I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence."
texts['Angry'] = "The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!"
texts['Surprised'] = "I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?"

for k,v in texts.items():
    noise = torch.randn(1,1,256).to(device)
    wav = inference(v, noise, diffusion_steps=10, embedding_scale=1)
    print(k + ": ")
    display(ipd.Audio(wav, rate=24000, normalize=False))

Happy: 




Sad: 


Angry: 


Surprised: 


#### With embedding_scale=2

In [11]:
texts = {}
texts['Happy'] = "We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands."
texts['Sad'] = "I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence."
texts['Angry'] = "The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!"
texts['Surprised'] = "I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?"

for k,v in texts.items():
    noise = torch.randn(1,1,256).to(device)
    wav = inference(v, noise, diffusion_steps=10, embedding_scale=2) # embedding_scale=2 for more pronounced emotion
    print(k + ": ")
    display(ipd.Audio(wav, rate=24000, normalize=False))

Happy: 




Sad: 


Angry: 


Surprised: 


### Long-form generation
This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page.

In [12]:
passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word "Homemade," when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first-class homemade products there is a market in all large cities. All first-class grocers have customers who purchase such goods.''' # @param {type:"string"}

In [13]:
sentences = passage.split('.') # simple split by comma
wavs = []
s_prev = None
for text in sentences:
    if text.strip() == "": continue
    text += '.' # add it back
    noise = torch.randn(1,1,256).to(device)
    wav, s_prev = LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=10, embedding_scale=1.5)
    wavs.append(wav)
display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))



In [15]:
# prompt: open csv /content/drive/MyDrive/Colab Notebooks/Dataset/Dataset.csv

import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset/Dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,text,string,code,Recording
0,0,ಬಾಗಿಲಿಗೆ ಬಂದ ಜ್ಯೋತಿಯನ್ನು ಸ್ವಾಗತಿಸಿ,ಬ್-ಆ-ಗ್-ಇ-ಲ್-ಇ-ಗ್-ಎ- -ಬ್-ಅ-ಂ-ದ್-ಅ- -ಜ್-ಯ್-ಓ-ತ್...,50-3-30-4-55-4-30-16- -50-1-61-45-1- -35-53-23...,kn_in_male\knm_05927_01699170109.wav
1,1,ತೊಡೆಸಂದಿಯ ಅಂಡವಾಯುಗಳು ಪುರುಷರ ಮತ್ತು ಮಹಿಳೆಯರಲ್ಲಿನ...,ತ್-ಒ-ಡ್-ಎ-ಸ್-ಅ-ಂ-ದ್-ಇ-ಯ್-ಅ- -ಅ-ಂ-ಡ್-ಅ-ವ್-ಆ-ಯ್-...,43-22-40-16-59-1-61-45-4-53-1- -1-61-40-1-56-3...,kn_in_male\knm_08025_01815510700.wav
2,2,ನಿಲ್ದಾಣದ ಮೂಲಕ ಮುಕ್ತವಾಗಿ ತೇಲಾಡುತ್ತ ಮಲಗಬಹುದು,ನ್-ಇ-ಲ್-ದ್-ಆ-ಣ್-ಅ-ದ್-ಅ- -ಮ್-ಊ-ಲ್-ಅ-ಕ್-ಅ- -ಮ್-ಉ...,47-4-55-45-3-42-1-45-1- -52-8-55-1-28-1- -52-7...,kn_in_male\knm_00574_01153125164.wav
3,3,ಅಂಡಮಾನಿನ ಪುಲಗ ಹಾಗೂ ಪ್ರಾಚೀನ ಭಾರತದ ವರುಣ ಅಂತರಿಕ್ಷ...,ಅ-ಂ-ಡ್-ಅ-ಮ್-ಆ-ನ್-ಇ-ನ್-ಅ- -ಪ್-ಉ-ಲ್-ಅ-ಗ್-ಅ- -ಹ್-...,1-61-40-1-52-3-47-4-47-1- -48-7-55-1-30-1- -60...,kn_in_male\knm_01114_01558059071.wav
4,4,ಈ ಸಮ್ಮೇಳನ ಕೇಂದ್ರ ಬ್ಯಾಂಕುಗಳೊಳಗೆ ಬೆಳೆಯಬೇಕಾದ ಸಹಕಾ...,ಈ- -ಸ್-ಅ-ಮ್-ಮ್-ಏ-ಳ್-ಅ-ನ್-ಅ- -ಕ್-ಏ-ಂ-ದ್-ರ್-ಅ- -...,5- -59-1-52-52-17-13-1-47-1- -28-17-61-45-54-1...,kn_in_male\knm_00271_00413960012.wav
...,...,...,...,...,...
4395,4395,ಮುಂದಿನ ಬೇಸಗೆಯಲ್ಲಿ ಫಸಲು ಬರುತ್ತದೆ,ಮ್-ಉ-ಂ-ದ್-ಇ-ನ್-ಅ- -ಬ್-ಏ-ಸ್-ಅ-ಗ್-ಎ-ಯ್-ಅ-ಲ್-ಲ್-ಇ...,52-7-61-45-4-47-1- -50-17-59-1-30-16-53-1-55-5...,kn_in_female\knf_01493_00549895262.wav
4396,4396,ಒಂದು ಪೋಷಕವಸ್ತುವಿನ ಕೊರತೆಯಿದ್ದಾಗ,ಒ-ಂ-ದ್-ಉ- -ಪ್-ಓ-ಷ್-ಅ-ಕ್-ಅ-ವ್-ಅ-ಸ್-ತ್-ಉ-ವ್-ಇ-ನ್...,22-61-45-7- -48-23-58-1-28-1-56-1-59-43-7-56-4...,kn_in_female\knf_01796_01447306308.wav
4397,4397,ಆದರೆ ಅಂತರ್ವ್ಯಾಪ್ತಿಗೆ ಮತ್ತೊಂದು ಅರ್ಥವೂ ಉಂಟು,ಆ-ದ್-ಅ-ರ್-ಎ- -ಅ-ಂ-ತ್-ಅ-ರ್-ವ್-ಯ್-ಆ-ಪ್-ತ್-ಇ-ಗ್-ಎ...,3-45-1-54-16- -1-61-43-1-54-56-53-3-48-43-4-30...,kn_in_female\knf_08476_00629542551.wav
4398,4398,ಆಯುರ್ವೇದವನ್ನು ಪ್ರಜಾಪತಿ ದಕ್ಷನಿಗೆ ಬ್ರಹ್ಮ ಉಪದೇಶಿಸಿದ.,ಆ-ಯ್-ಉ-ರ್-ವ್-ಏ-ದ್-ಅ-ವ್-ಅ-ನ್-ನ್-ಉ- -ಪ್-ರ್-ಅ-ಜ್-...,3-53-7-54-56-17-45-1-56-1-47-47-7- -48-54-1-35...,kn_in_female\knf_09696_00446271427.wav


In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# prompt: i want to further train the above model using my dataset

# Assuming you have your dataset prepared and ready to be used for training.
# You'll need to modify the training loop and data loading parts accordingly.

# Example:
# 1. Modify the data loading part to load your dataset.
# 2. Adjust the training loop to iterate through your dataset and update model parameters.
# 3. Define loss functions and optimization methods as needed.

# Example code snippet (placeholder):

# Define your dataset and dataloader
# ...

# Define your training loop
def train(model, optimizer, dataloader, epochs):
  for epoch in range(epochs):
    for batch in dataloader:
      # 1. Prepare input data (e.g., text, audio) and move it to the device.
      # 2. Forward pass: Run the model to get the output.
      # 3. Compute the loss based on the output and target values.
      # 4. Backward pass: Calculate gradients of the loss with respect to model parameters.
      # 5. Update model parameters using the optimizer.
      # ...

      # Example (placeholder):
      input_data = df['text']  # Your input data
      target_data = df['recording'] # Your target data
      output = model(input_data)
      loss = compute_loss(output, target_data)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Print training progress (optional)
      print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Example usage:
# ... (define optimizer, learning rate, etc.)
train(model, optimizer, dataloader, epochs=10)

# Save the trained model
# ...

# Remember to adjust the code to match your specific dataset and training requirements.

NameError: name 'optimizer' is not defined

In [None]:
# prompt: train(model, optimizer, dataloader, epochs=10) set optimizer and dataloader for this

# Example optimizer and dataloader (you'll need to adapt these based on your model and data)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # Adjust learning rate as needed

# Assuming you have a PyTorch Dataset class defined for your data
from torch.utils.data import DataLoader

# Replace 'YourDataset' with your actual dataset class
train_dataset = YourDataset(...)
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Now you can call the train function with the defined optimizer and dataloader
train(model, optimizer, dataloader, epochs=10)