# Running MMS-TTS inference in Colab
In this notebook, we give an example of how to run text-to-speech inference using MMS-TTS models.

By default, we run inference on a GPU. If you want to perform CPU inference, go to the "Runtime" menu -> "Change runtime type" and set "Hardware accelerator" to "None" before running.

## 1. Preliminaries
This section installs the Python packages required by the other sections. Run it first.

In [10]:
# Fetch the VITS reference implementation and move into the checkout.
# NOTE(review): this cell is not idempotent. Re-running it while already
# inside vits/ clones a nested copy (the saved output below shows
# /content/vits/vits), so run it once from a fresh runtime.
%pwd
!git clone https://github.com/jaywalnut310/vits.git
!python --version
%cd vits/

# Install the Python dependencies; some are pinned to exact versions,
# the rest are left unpinned.
!pip install Cython==0.29.21
!pip install librosa==0.8.0
!pip install phonemizer==2.2.1
!pip install scipy
!pip install numpy
!pip install torch
!pip install torchvision
!pip install matplotlib
!pip install Unidecode==1.1.1

# Build the Cython extension in monotonic_align/ in place, then return to the
# repository root.
# NOTE(review): the nested monotonic_align/ directory is presumably where the
# in-place build writes the compiled module -- confirm against setup.py.
%cd monotonic_align/
%mkdir monotonic_align
!python3 setup.py build_ext --inplace
%cd ../
%pwd

Cloning into 'vits'...
remote: Enumerating objects: 81, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 81 (delta 21), reused 21 (delta 21), pack-reused 26[K
Receiving objects: 100% (81/81), 3.33 MiB | 7.14 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Python 3.10.12
/content/vits/vits
/content/vits/vits/monotonic_align
Compiling core.pyx because it changed.
[1/1] Cythonizing core.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
[01m[Kcore.c:[m[K In function ‘[01m[K__Pyx_InitGlobals[m[K’:
16766 | [01;35m[KPyEval_InitThreads[m[K();
      | [01;35m[K^~~~~~~~~~~~~~~~~~[m[K
In file included from [01m[K/usr/include/python3.10/Python.h:130[m[K,
                 from [01m[Kcore.c:16[m[K:
[01m[K/usr/include/python3.10/ceval.h:122:37:[m[K [01;36m[Knote: [m[Kdeclared here
  122 | Py_DEPRECATED(3.9) PyAPI_FUNC(void) [01;36m[KPyEval_InitThreads[m[K(void);
      |            

'/content/vits/vits'

## 2. Choose a language and download its checkpoint
Find the ISO code for your target language [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html). You can find more details about the languages we currently support for TTS in this [table](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html).

In [11]:
from IPython.display import Audio
import os
import re
import glob
import json
import tempfile
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np
import commons
import utils
import argparse
import subprocess
import subprocess
import locale
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from scipy.io.wavfile import write


In [12]:

# Replace locale.getpreferredencoding with a function that always returns "UTF-8".
# NOTE(review): presumably a workaround for the runtime reporting a non-UTF-8
# locale, which would affect shell commands and default text encodings -- confirm.
locale.getpreferredencoding = lambda: "UTF-8"

def download(lang, tgt_dir="./"):
  """Download and unpack the MMS-TTS checkpoint archive for one language.

  The archive ``{lang}.tar.gz`` is saved inside ``tgt_dir`` and extracted
  there, producing the directory ``{tgt_dir}/{lang}``.

  Args:
    lang: language code used in the archive name (e.g. "eng").
    tgt_dir: directory that receives the archive and the extracted files.
      It is created if it does not exist.

  Returns:
    Path of the extracted checkpoint directory, ``os.path.join(tgt_dir, lang)``.

  Raises:
    subprocess.CalledProcessError: if ``wget`` or ``tar`` exits with a
      non-zero status.
  """
  lang_fn, lang_dir = os.path.join(tgt_dir, lang+'.tar.gz'), os.path.join(tgt_dir, lang)
  # An empty tgt_dir means "current directory" for both makedirs and tar -C.
  extract_dir = tgt_dir or "."
  os.makedirs(extract_dir, exist_ok=True)
  print(f"Download model for language: {lang}")
  # Run each step separately, without a shell: a failed download now raises
  # instead of being masked by the exit status of a later command, and `lang`
  # is never interpreted by a shell.
  subprocess.check_output(
      ["wget", f"https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz", "-O", lang_fn]
  )
  # Extract into tgt_dir (not the current directory) so that lang_dir is
  # correct for every tgt_dir, not only the default "./".
  subprocess.check_output(["tar", "zxvf", lang_fn, "-C", extract_dir])
  print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
  return lang_dir

# Language code of the model to use; download() fetches and unpacks its
# checkpoint and returns the directory holding the extracted files.
LANG = "eng"
ckpt_dir = download(LANG)

Download model for language: eng
Model checkpoints in ./eng: ['G_100000.pth', 'config.json', 'vocab.txt']


## 3. Load the checkpoint

In [13]:

def preprocess_char(text, lang=None):
    """
    Apply language-specific character substitutions to ``text``.

    Prints ``lang``. When ``lang`` is 'ron', every "ț" is replaced by "ţ";
    for any other value the text is returned unchanged.
    """
    print(lang)
    return text.replace("ț", "ţ") if lang == 'ron' else text

class TextMapper(object):
    """Maps characters to and from integer IDs defined by a vocabulary file.

    The vocabulary file holds one symbol per line; a symbol's ID is its
    zero-based line index. The vocabulary must contain a " " (space) symbol.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary.

        Args:
            vocab_file: path to a UTF-8 text file with one symbol per line.

        Raises:
            ValueError: if the vocabulary contains no " " symbol.
        """
        # Context manager so the file handle is closed deterministically.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [x.replace("\n", "") for x in vocab.readlines()]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

        Leading and trailing whitespace is stripped before conversion.

        Args:
            text: string to convert to a sequence
            cleaner_names: accepted for interface compatibility; not used here
        Returns:
            List of integers corresponding to the symbols in the text
        Raises:
            KeyError: if the text contains a symbol missing from the vocabulary
                (call filter_oov first to drop such symbols)
        '''
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Run ``text`` through the uroman Perl script and return its first output line.

        Args:
            text: text to process.
            uroman_pl: path to the ``uroman.pl`` script.

        Returns:
            The first line of the script's output, with every whitespace run
            collapsed to a single space and the ends stripped. An empty
            string if the script produced no output.

        Raises:
            FileNotFoundError: if ``perl`` is not installed.
            subprocess.CalledProcessError: if the script exits with a
                non-zero status.
        """
        iso = "xxx"
        # Pass the text over stdin and read stdout directly: no temp files,
        # no shell (paths with spaces or metacharacters are safe), explicit
        # UTF-8 on both sides, and a failing script raises instead of
        # silently yielding empty output.
        proc = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            check=True,
        )
        first_line = proc.stdout.split("\n", 1)[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Encode ``text`` as a ``torch.LongTensor`` of symbol IDs.

        When ``hps.data.add_blank`` is truthy, ID 0 is interspersed into the
        sequence via ``commons.intersperse``.

        Args:
            text: text whose symbols are all in the vocabulary.
            hps: hyper-parameter object; ``hps.data.text_cleaners`` and
                ``hps.data.add_blank`` are read.

        Returns:
            A ``torch.LongTensor`` holding the ID sequence.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        """Drop every character of ``text`` that is not in the vocabulary.

        Prints the filtered text and returns it.
        """
        val_chars = self._symbol_to_id
        txt_filt = "".join(ch for ch in text if ch in val_chars)
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    """Normalise raw text so it only contains symbols the model knows.

    Steps, in order: language-specific character fixes (``preprocess_char``),
    optional uroman processing, lower-casing, and removal of characters that
    are not in the vocabulary of ``text_mapper``.

    uroman is applied when the last dot-separated component of
    ``hps.data.training_files`` is ``'uroman'``.

    Args:
        txt: input text.
        text_mapper: object providing ``uromanize`` and ``filter_oov``.
        hps: hyper-parameter object; ``hps.data.training_files`` is read.
        uroman_dir: directory of an existing uroman checkout (containing
            ``bin/uroman.pl``). When None and uroman is needed, the
            repository is cloned into a temporary directory.
        lang: language code forwarded to ``preprocess_char``.

    Returns:
        The processed text.

    Raises:
        subprocess.CalledProcessError: if cloning uroman fails.
    """
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                # Clone over HTTPS: the SSH form (git@github.com:...) needs a
                # GitHub SSH key, which a fresh Colab runtime does not have.
                # An argument list avoids shell parsing of tmp_dir.
                cmd = ["git", "clone", "https://github.com/isi-nlp/uroman.git", tmp_dir]
                print(" ".join(cmd))
                subprocess.check_output(cmd)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print("uromanize")
            # Must run inside the `with`: the temporary clone is deleted on exit.
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt



In [14]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Run inference with {device}")

# Vocabulary and config files inside the checkpoint directory.
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"

hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)

# Build the synthesizer from the vocabulary size and the config values,
# move it to the chosen device and switch it to evaluation mode.
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g.to(device)
_ = net_g.eval()

# Load the generator weights into the model (None is passed as the third argument).
g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")
_ = utils.load_checkpoint(g_pth, net_g, None)

Run inference with cuda
load ./eng/G_100000.pth


## 4. Generate audio from text
Specify the sentence you want to synthesize, then generate the audio.

In [15]:
txt = "What you think of yourself matters much more than what others think about you. You should think wisely."
print(f"text: {txt}")

# Normalise the sentence, then encode it as a tensor of symbol IDs.
txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
stn_tst = text_mapper.get_text(txt, hps)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    hyp = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1.0,
    )[0][0,0].cpu().float().numpy()

print("Generated audio")
# Last expression of the cell: renders the audio player.
Audio(hyp, rate=hps.data.sampling_rate)

text: What you think of yourself matters much more than what others think about you. You should think wisely.
eng
text after filtering OOV: what you think of yourself matters much more than what others think about you you should think wisely
Generated audio


In [16]:
txt = "There is no need of any competition with anybody. You are yourself and as you are, you are perfectly good. Accept yourself."
print(f"text: {txt}")

# Normalise the sentence, then encode it as a tensor of symbol IDs.
txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
stn_tst = text_mapper.get_text(txt, hps)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    hyp = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1.0,
    )[0][0,0].cpu().float().numpy()

print("Generated audio")
# Last expression of the cell: renders the audio player.
Audio(hyp, rate=hps.data.sampling_rate)

text: There is no need of any competition with anybody. You are yourself and as you are, you are perfectly good. Accept yourself.
eng
text after filtering OOV: there is no need of any competition with anybody you are yourself and as you are you are perfectly good accept yourself
Generated audio


## 5. Synthesizing a Hindi sentence

In [17]:
# Switch to the Hindi model: download() fetches and unpacks its checkpoint
# and returns the directory holding the extracted files.
LANG = "hin"
ckpt_dir = download(LANG)

Download model for language: hin
Model checkpoints in ./hin: ['G_100000.pth', 'config.json', 'vocab.txt']


In [18]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Run inference with {device}")

# Vocabulary and config files inside the checkpoint directory.
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"

hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)

# Build the synthesizer from the vocabulary size and the config values,
# move it to the chosen device and switch it to evaluation mode.
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g.to(device)
_ = net_g.eval()

# Load the generator weights into the model (None is passed as the third argument).
g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")
_ = utils.load_checkpoint(g_pth, net_g, None)

Run inference with cuda
load ./hin/G_100000.pth


In [19]:
txt = "आप क्या सोचते हैं अपने बारे में ज्यादा मायने रखता है कि दूसरे क्या सोचते हैं आपके बारे में। इसलिए आपको समझदारी से सोचना चाहिए।"
print(f"text: {txt}")

# Normalise the sentence, then encode it as a tensor of symbol IDs.
txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
stn_tst = text_mapper.get_text(txt, hps)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    hyp = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1.0,
    )[0][0,0].cpu().float().numpy()

print("Generated audio")
# Last expression of the cell: renders the audio player.
Audio(hyp, rate=hps.data.sampling_rate)

text: आप क्या सोचते हैं अपने बारे में ज्यादा मायने रखता है कि दूसरे क्या सोचते हैं आपके बारे में। इसलिए आपको समझदारी से सोचना चाहिए।
hin
text after filtering OOV: आप क्या सोचते हैं अपने बारे में ज्यादा मायने रखता है कि दूसरे क्या सोचते हैं आपके बारे में इसलिए आपको समझदारी से सोचना चाहिए
Generated audio


In [20]:
txt = "किसी से किसी भी तरह की प्रतिस्पर्धा की आवश्यकता नहीं है। आप स्वयं में ही जैसे हैं एकदम सही हैं। अपने को स्वीकारिये।"
print(f"text: {txt}")

# Normalise the sentence, then encode it as a tensor of symbol IDs.
txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
stn_tst = text_mapper.get_text(txt, hps)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    hyp = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1.0,
    )[0][0,0].cpu().float().numpy()

print("Generated audio")
# Last expression of the cell: renders the audio player.
Audio(hyp, rate=hps.data.sampling_rate)

text: किसी से किसी भी तरह की प्रतिस्पर्धा की आवश्यकता नहीं है। आप स्वयं में ही जैसे हैं एकदम सही हैं। अपने को स्वीकारिये।
hin
text after filtering OOV: किसी से किसी भी तरह की प्रतिस्पर्धा की आवश्यकता नहीं है आप स्वयं में ही जैसे हैं एकदम सही हैं अपने को स्वीकारिये
Generated audio
