# Dependencies and Imports

In [1]:
#@title Install dependencies

!pip install -q torchaudio omegaconf

import torch
from pprint import pprint
from omegaconf import OmegaConf
from IPython.display import Audio, display

torch.hub.download_url_to_file('https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml',
                               'latest_silero_models.yml',
                               progress=False)
models = OmegaConf.load('latest_silero_models.yml')

[K     |████████████████████████████████| 1.9 MB 29.0 MB/s 
[K     |████████████████████████████████| 74 kB 3.2 MB/s 
[K     |████████████████████████████████| 112 kB 41.5 MB/s 
[K     |████████████████████████████████| 636 kB 28.0 MB/s 
[?25h  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone


# Demo

In [2]:
# see latest avaiable models
available_languages = list(models.tts_models.keys())
print(f'Available languages {available_languages}')

for lang in available_languages:
    speakers = list(models.tts_models.get(lang).keys())
    print(f'Available speakers for {lang}: {speakers}')

Available languages ['ru', 'en', 'de', 'es', 'fr', 'ba', 'xal', 'tt', 'uz', 'multi']
Available speakers for ru: ['aidar_v2', 'aidar_8khz', 'aidar_16khz', 'baya_v2', 'baya_8khz', 'baya_16khz', 'irina_v2', 'irina_8khz', 'irina_16khz', 'kseniya_v2', 'kseniya_8khz', 'kseniya_16khz', 'natasha_v2', 'natasha_8khz', 'natasha_16khz', 'ruslan_v2', 'ruslan_8khz', 'ruslan_16khz']
Available speakers for en: ['lj_v2', 'lj_8khz', 'lj_16khz']
Available speakers for de: ['thorsten_v2', 'thorsten_8khz', 'thorsten_16khz']
Available speakers for es: ['tux_v2', 'tux_8khz', 'tux_16khz']
Available speakers for fr: ['gilles_v2', 'gilles_8khz', 'gilles_16khz']
Available speakers for ba: ['aigul_v2']
Available speakers for xal: ['erdni_v2']
Available speakers for tt: ['dilyara_v2']
Available speakers for uz: ['dilnavoz_v2']
Available speakers for multi: ['multi_v2']


In [32]:
import torch

language = 'ru'
speaker = 'baya_16khz'
device = torch.device('cpu')
model, symbols, sample_rate, example_text, apply_tts = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                                                      model='silero_tts',
                                                                      language=language,
                                                                      speaker=speaker)
model = model.to(device)  # gpu or cpu

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


In [33]:
audio = apply_tts(texts=["Здр+авствуйте, в +этом в+ыпуске мы устан+овим вордпр+есс на х+остинг рег.ру."],
                  model=model,
                  sample_rate=sample_rate,
                  symbols=symbols,
                  device=device)

print(example_text)
display(Audio(audio[0], rate=sample_rate))

В н+едрах т+ундры в+ыдры в г+етрах т+ырят в в+ёдра +ядра к+едров.


## Enhance synthesis with logmmse

In [7]:
!pip install -q logmmse

You can try to enhance synthesized audio with logmmse algorithm, though it could demand parameters tuning for the particular speaker.

In [8]:
import numpy as np
from logmmse import logmmse

enhanced = logmmse(np.array(audio[0]), sample_rate, output_file=None, initial_noise=1, window_size=160, noise_threshold=0.15)
display(Audio(enhanced, rate=sample_rate))