# Nepali Text-to-Speech with Tacotron2 and Waveglow


In [1]:
#@title Mount Google Drive
from google.colab import drive

# The checkpoint paths configured later in this notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [14]:
#@title Clone the REPO
import os
from os.path import exists, join, basename, splitext

git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  # clone and install
  !git clone -q --recursive {git_repo_url}
  !cd {project_name}/waveglow && git checkout 2fd4e63
  !pip install -q librosa unidecode
  !pip install -q --upgrade gdown
  
import sys
sys.path.append(join(project_name, 'waveglow/'))
sys.path.append(project_name)
import time
import gdown
import matplotlib
import matplotlib.pylab as plt
plt.rcParams["axes.grid"] = False

In [28]:
#@title Download Pretrained model

# Checkpoint locations. Despite the cell title, nothing is downloaded here: both
# values are plain paths under /content/drive, so the files must already exist
# on the mounted Google Drive. The trailing `#@param` markers are Colab form
# annotations attached to each assignment.
tacotron2_pretrained_model = '/content/drive/MyDrive/colab/outdir/Shruti_22kHz_45epoch'#@param {type:"string"}
waveglow_pretrained_model = '/content/drive/MyDrive/colab/outdir/waveglow_256channels_ljs_v3.pt'#@param {type:"string"}
# Disabled alternative: fetch the checkpoints with gdown instead of reading them from Drive.
# NOTE(review): the Tacotron2 URL below is a "file/d/.../view" share link while the
# WaveGlow one is a direct "uc?id=" link -- confirm gdown accepts the former before re-enabling.
# tacotron2_pretrained_model = 'tacotron2_statedict.pkl'
# if not exists(tacotron2_pretrained_model):
#   # download the Tacotron2 pretrained model
#   gdown.download('https://drive.google.com/file/d/1S3Y6EK_wtaJ23nxnkGwmkBQvjEh-_wkV/view?usp=sharing', tacotron2_pretrained_model, quiet=False)
# waveglow_pretrained_model = 'waveglow_old.pt'
# if not exists(waveglow_pretrained_model):
#   # download the Waveglow pretrained model  
#   gdown.download('https://drive.google.com/uc?id=1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx', waveglow_pretrained_model, quiet=False)

In [29]:
#@title Creating Hyperparameters

from text import symbols

class HParams:
    """Plain attribute container for the hyperparameters used in this notebook.

    Every setting is a public instance attribute assigned in ``__init__``; there
    is no validation. The instance is handed to ``Tacotron2(...)`` in a later
    cell, and that cell also overrides a few of the values set here.
    ``n_symbols`` is taken from the length of ``symbols`` imported from the
    project's ``text`` module.
    """

    def __init__(self) -> None:
        # ---- Experiment / runtime settings ----
        self.epochs = 500
        self.iters_per_checkpoint = 1000
        self.seed = 1234
        self.dynamic_loss_scaling = True
        self.fp16_run = False
        self.distributed_run = False
        self.dist_backend = "nccl"
        self.dist_url = "tcp://localhost:54321"
        self.cudnn_enabled = True
        self.cudnn_benchmark = False
        self.ignore_layers = ['embedding.weight']

        # ---- Data ----
        self.load_mel_from_disk = False
        self.training_files = './filelists/train_list.txt'
        self.validation_files = './filelists/val_list.txt'
        self.text_cleaners = ['transliteration_cleaners']

        # ---- Audio ----
        self.max_wav_value = 32768.0
        self.sampling_rate = 22050
        self.filter_length = 1024
        self.hop_length = 256
        self.win_length = 1024
        self.n_mel_channels = 80
        self.mel_fmin = 0.0
        self.mel_fmax = 8000.0

        # ---- Model ----
        self.n_symbols = len(symbols)
        self.symbols_embedding_dim = 512

        # Encoder
        self.encoder_kernel_size = 5
        self.encoder_n_convolutions = 3
        self.encoder_embedding_dim = 512

        # Decoder
        self.n_frames_per_step = 1  # only a value of 1 is currently supported
        self.decoder_rnn_dim = 1024
        self.prenet_dim = 256
        self.max_decoder_steps = 1000
        self.gate_threshold = 0.5
        self.p_attention_dropout = 0.1
        self.p_decoder_dropout = 0.1

        # Attention
        self.attention_rnn_dim = 1024
        self.attention_dim = 128

        # Location layer
        self.attention_location_n_filters = 32
        self.attention_location_kernel_size = 31

        # Mel post-processing network (postnet)
        self.postnet_embedding_dim = 512
        self.postnet_kernel_size = 5
        self.postnet_n_convolutions = 5

        # ---- Optimization ----
        self.use_saved_learning_rate = False
        self.learning_rate = 1e-3
        self.weight_decay = 1e-6
        self.grad_clip_thresh = 1.0
        self.batch_size = 8
        self.mask_padding = True  # set the model's padded outputs to padded values


# Shared hyperparameter instance read (and partly overridden) by the cells below.
hparams = HParams()

In [64]:
#@title Initialize Tacotron2 and Waveglow
%matplotlib inline
import IPython.display as ipd
import numpy as np
import torch

# Project-local modules; they resolve through the sys.path entries added by the clone cell.
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from text import text_to_sequence
from denoiser import Denoiser

# Load Tacotron2 (run this cell every time you change the model)
# The three assignments below mutate the shared `hparams` object before the model is built from it.
hparams.sampling_rate = 22050 # Don't change this
hparams.max_decoder_steps = 10000 # Upper bound on decoder steps before generation is cut off (1000 is about 11 seconds)
hparams.gate_threshold = 0.1 # Stop threshold: the higher this number, the more likely generation keeps going until max_decoder_steps. NOTE(review): the decoder appears to stop once the predicted stop probability exceeds this value, so 0.1 would mean stopping at ~10% confidence rather than "90% sure" -- confirm against model.py
model = Tacotron2(hparams)
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(tacotron2_pretrained_model)['state_dict'])
# Move to the GPU, switch to eval mode and cast to half precision; `_ =` suppresses the cell output.
_ = model.cuda().eval().half()

# Load WaveGlow
# This checkpoint holds the model object itself under the 'model' key (not just a state dict).
waveglow = torch.load(waveglow_pretrained_model)['model']
waveglow.cuda().eval().half()
# Cast each entry of waveglow.convinv back to float after the half() cast above.
for k in waveglow.convinv:
    k.float()
# Denoiser is constructed from the loaded WaveGlow model; a later cell applies it to the generated audio.
denoiser = Denoiser(waveglow)



In [60]:
#@title Remove punctuations
# Word -> pronunciation lookup consulted by ARPA(); keys are upper-cased words.
# It is empty here, so no substitutions happen unless entries are added.
thisdict = {}

def ARPA(text, punctuation=r"!?,।.;", EOS_Token=True):
    """Replace known words with their dictionary pronunciation and append an EOS token.

    The text is split on single spaces. For each word, trailing characters found
    in ``punctuation`` are peeled off (always leaving at least one character),
    the remaining word is looked up upper-cased in ``thisdict``, and on a hit it
    is replaced by ``"{<pronunciation>}"``. The peeled punctuation is then
    re-attached. Words are re-joined with single spaces, so runs of spaces
    collapse.

    Args:
        text: Input sentence. May be empty.
        punctuation: Characters treated as trailing punctuation.
        EOS_Token: When true, make sure the result ends with ``";"``.

    Returns:
        The processed string. For empty input this is ``";"`` when
        ``EOS_Token`` is true and ``""`` otherwise (the previous implementation
        raised ``IndexError`` on empty input).
    """
    out = ''
    for raw_word in text.split(" "):
        word, end_chars = raw_word, ''
        # Strip trailing punctuation one character at a time; never reduce the
        # word below one character so a lone "?" stays intact.
        while len(word) > 1 and word[-1] in punctuation:
            end_chars = word[-1] + end_chars
            word = word[:-1]
        key = word.upper()
        if key in thisdict:
            word = "{" + str(thisdict[key]) + "}"
        out = (out + " " + word + end_chars).strip()
    # endswith() is safe on an empty string, unlike indexing out[-1].
    if EOS_Token and not out.endswith(";"):
        out += ";"
    return out

Now convert the text into a mel spectrogram using Tacotron2 (the plotting call in the cell below is currently commented out):

In [68]:
#@title Synthesize a text

text = '\u0924\u094D\u092F\u094B \u0930\u093E\u0924\u094D\u0930\u093F \u092A\u0928\u093F - \u0938\u0927\u0948\u0901\u0915\u094B \u091C\u0938\u094D\u0924\u094B \u0938\u093E\u0927\u093E\u0930\u0923 \u0930\u093E\u0924\u094D\u0930\u093F \u0925\u093F\u092F\u094B ?'#@param {type:"string"}
# NOTE(review): sigma and denoise_strength are not read anywhere in this cell; the
# vocoder and denoiser calls further down pass their own literal values.
sigma = 0.8
denoise_strength = 0.324
raw_input = True # Skips the ARPA() preprocessing and only appends the ";" end-of-sentence token; useful for inputting your own pronunciations or just for testing.
                  # Should be True if synthesizing a non-English language.

# Each non-empty line of `text` is synthesized separately. The result variables
# are overwritten on every iteration, so after the loop they hold the output for
# the last line only.
for raw_line in text.split("\n"):
    # Strip surrounding whitespace so blank or whitespace-only lines are skipped
    # instead of being synthesized as a bare ";".
    line = raw_line.strip()
    if not line:
        continue
    print(line)
    if raw_input:
        prompt = line if line.endswith(";") else line + ";"
    else:
        prompt = ARPA(line)
    print(prompt)
    with torch.no_grad(): # save VRAM by not including gradients
        # Encode the text as symbol ids and add a leading batch dimension of 1.
        sequence = np.array(text_to_sequence(prompt, ['transliteration_cleaners']))[None, :]
        # from_numpy already returns a tensor; the deprecated autograd.Variable wrapper is not needed.
        sequence = torch.from_numpy(sequence).cuda().long()
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        # plot_data((mel_outputs_postnet, alignments))

त्यो रात्रि पनि - सधैँको जस्तो साधारण रात्रि थियो ?
त्यो रात्रि पनि - सधैँको जस्तो साधारण रात्रि थियो ?;


Finally, we can convert the generated mel spectrogram into audio with WaveGlow:

In [75]:
# Run the WaveGlow vocoder on the Tacotron2 mel spectrogram. This is pure
# inference, so autograd is disabled: no graph is kept in GPU memory and the
# result can be converted to NumPy directly.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet.half().cuda(), sigma=0.666)
# Play the first item of the returned batch.
ipd.Audio(audio[0].cpu().numpy(), rate=hparams.sampling_rate)

In [76]:
# remove waveglow bias
# Run the denoiser without autograd so the result never requires grad and
# .numpy() below works regardless of how `audio` was produced.
with torch.no_grad():
    audio_denoised = denoiser(audio, strength=0.01)[:, 0]  # keep index 0 along the second axis
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate)