# Live Colab Example

## Dependencies and Imports

In [1]:
#@title Install dependencies

!pip install -q omegaconf

import os
from os.path import exists

if not exists('silero-models'):
  !git clone -q --depth 1 https://github.com/snakers4/silero-models

%cd silero-models

# silero imports
import torch
from omegaconf import OmegaConf
from tts_utils import (init_jit_model, 
                       prepare_tts_model_input,
                       process_tts_model_output)

device = torch.device('cpu')   # you can use any pytorch device
models = OmegaConf.load('models.yml')

# imports for uploading/recording
import numpy as np
import ipywidgets as widgets
from ipywidgets import IntProgress
from IPython.display import Audio, display

## TTS

In [2]:
#@markdown { run: "auto" }

speaker = "Baya [ru]" #@param ["Baya [ru]", "Kseniya [ru]", "Aidar [ru]", "Irina [ru]", "Natasha [ru]", "Ruslan [ru]", "LJ [en]", "Thorsten [de]", "Tux [es]", "Gilles [fr]"]
sample_rate_khz = '8khz' #@param ['8khz', '16khz']

# Form label -> [language code, model-name template]; the "{}" placeholder
# is filled with the selected sample-rate label.
speaker_dict = {
    "Baya [ru]": ["ru", "baya_{}"],
    "Kseniya [ru]": ["ru", "kseniya_{}"],
    "Aidar [ru]": ["ru", "aidar_{}"],
    "Irina [ru]": ["ru", "irina_{}"],
    "Natasha [ru]": ["ru", "natasha_{}"],
    "Ruslan [ru]": ["ru", "ruslan_{}"],
    "LJ [en]": ["en", "lj_{}"],
    "Thorsten [de]": ["de", "thorsten_{}"],
    "Tux [es]": ["es", "tux_{}"],
    "Gilles [fr]": ["fr", "gilles_{}"],
}

# Any label other than '16khz' resolves to 8000.
sample_rate = {'16khz': 16000}.get(sample_rate_khz, 8000)

lang, speaker_name = speaker_dict[speaker]
model_key = speaker_name.format(sample_rate_khz)
print(lang, model_key)

# Pick the `latest` entry for this voice from models.yml and initialise it.
model_conf = models.tts_models[lang][model_key].latest
model = init_jit_model(model_conf.jit, device=device)

ru baya_8khz


In [3]:
text = 'В н+едрах т+ундры в+ыдры в г+етрах т+ырят в в+ёдра +ядра к+едров.' #@param {type: "string"}

def _text_to_speech(text):
    """Synthesize `text` with the currently selected model and play it inline.

    Reads the notebook-level `model_conf`, `model`, `device` and
    `sample_rate` (they are only read, so no `global` statement is needed).

    Args:
        text: Input handed to `prepare_tts_model_input`.

    Returns:
        The first item of the list returned by `process_tts_model_output`.
    """
    text_padded, orig_ids = prepare_tts_model_input(text, symbols=model_conf.tokenset)
    # Move the input to the device the model was initialised on, so the
    # function keeps working when `device` is changed from the CPU default.
    out, out_lens = model(text_padded.to(device))
    audio = process_tts_model_output(out, out_lens, orig_ids, sample_rate)[0]
    display(Audio(audio, rate=sample_rate))
    return audio

In [4]:
# Synthesize the sentence from the form above and keep the result in `audio`.
audio = _text_to_speech(text=text)

# PyTorch Example

In [5]:
#@title Install Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q omegaconf

import os
from os.path import exists

if not exists('silero-models'):
  !git clone -q --depth 1 https://github.com/snakers4/silero-models

%cd silero-models

import torch
from omegaconf import OmegaConf
from tts_utils import (init_jit_model, 
                       prepare_tts_model_input,
                       process_tts_model_output)
from IPython.display import display, Audio

In [7]:
models = OmegaConf.load('models.yml')  # all available models are listed in the yml file
# Use the first GPU when CUDA is available, otherwise fall back to the CPU so
# the cell also runs on machines without a GPU; any pytorch device can be used.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model_conf = models.tts_models.ru.baya_8khz.latest
sample_rate = 8000  # matches the 8khz model selected above

model = init_jit_model(model_conf.jit, device=device)

In [8]:
# Single-utterance synthesis
text = "В н+едрах т+ундры в+ыдры в г+етрах т+ырят в в+ёдра +ядра к+едров."

# Only the first element of each returned pair is used in this cell.
text_padded, _ = prepare_tts_model_input(text, symbols=model_conf.tokenset)
out, _ = model(text_padded.to(device))

# Take the first item of the model output and bring it back to the CPU.
audio = out[0].cpu()

display(Audio(audio, rate=sample_rate))

  result = self.forward(*input, **kwargs)


In [9]:
# Batch synthesis: several sentences in one model call
texts = [
    "К+арл у Кл+ары укр+ал кор+аллы,",
    "Кл+ара у К+арла укр+ала кларн+ет.",
]

text_padded, orig_ids = prepare_tts_model_input(texts, symbols=model_conf.tokenset)
out, out_lens = model(text_padded.to(device))
audios = process_tts_model_output(out, out_lens, orig_ids, sample_rate)

# Print each source sentence, followed by a player for its audio.
for idx, audio in enumerate(audios):
    caption = f'Text: {texts[idx]}'
    print(caption)
    display(Audio(audio, rate=sample_rate))

Text: К+арл у Кл+ары укр+ал кор+аллы,


Text: Кл+ара у К+арла укр+ала кларн+ет.
