<a href="https://colab.research.google.com/github/rmcpantoja/My-Colab-Notebooks/blob/main/notebooks/Cuaderno_de_s%C3%ADntesis_VITS_TTS_espa%C3%B1ol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<b><font color="pink" size="+2">Cuaderno de síntesis en español de [VITS TTS](https://github.com/jaywalnut310/vits): Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech.</font></b>


---


- Cuaderno desarrollado por: [rmcpantoja](https://github.com/rmcpantoja/)
- Decorado por: [Xx_Nessu_xX](https://fakeyou.com/profile/Xx_Nessu_xX)

In [None]:
#@markdown # <font color="pink"> **Instalar software.** 📦
#@markdown ---
#@markdown #### <font color="orange">**Importante: Reinicia el entorno de ejecución al terminar de ejecutarse esta celda. Luego, continúa a la siguiente celda.**
%cd /content
!git clone https://github.com/rmcpantoja/vits.git
%cd vits
!pip install Cython librosa==0.8.0 matplotlib numpy phonemizer num2words scipy tensorboard Unidecode torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 torchtext==0.14.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -U
#%cd vits
%cd monotonic_align
!mkdir monotonic_align
!python setup.py build_ext --inplace
!apt-get install espeak-ng
%cd ..
!pip install --upgrade gdown

In [None]:
#@markdown # <font color="pink"> **Descargar e Iniciar el modelo TTS.** 💾
#@markdown ---
#@markdown Pon el ID de tu modelo.  <font color="orange">**(Archivo G_0)**
# Work from the cloned repository: the config and checkpoint paths used later in
# this cell are relative to it.
%cd /content/vits
# Colab form field holding the ID passed to gdown (the G_0 file, per the form text).
# The default value is only a placeholder and must be replaced by the user.
model_drive_id = "Pon la id del archivo g_0 aqu\xED" #@param{type: "string"}
# Download the file and store it as pretrained.pth inside the repository.
!gdown '{model_drive_id}' -O /content/vits/pretrained.pth
# Render matplotlib figures inline in the cell output.
%matplotlib inline
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, Audio, Markdown, Javascript

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn raw text into a CUDA LongTensor of symbol IDs.

    Args:
        text: The string to convert.
        hps: Hyperparameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` moved to the CUDA device.
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    # When add_blank is set, the sequence goes through commons.intersperse with 0
    # (presumably inserting a blank ID between symbols -- confirm in commons).
    symbol_ids = commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    return torch.LongTensor(symbol_ids).cuda()

# Hyperparameters for the Spanish single-speaker 22 kHz configuration.
hps = utils.get_hparams_from_file("./configs/base_es_singlespeaker_22k.json")

# Build the synthesizer from the symbol count, two sizes derived from the data and
# training settings, and the model section of the config.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
# Move the network to the GPU and switch it to evaluation mode.
net_g = net_g.cuda()
net_g.eval()

# Load the downloaded checkpoint into the network; no optimizer is passed.
_ = utils.load_checkpoint("pretrained.pth", net_g, None)

def run_tts(text, rate = 1, noise_s = 0.667, noise_scale_w = 0.8, auto_play=True):
    """Synthesize ``text`` with the loaded model and show the result in the notebook.

    Args:
        text: The string to synthesize.
        rate: Forwarded to ``net_g.infer`` as ``length_scale``.
        noise_s: Forwarded to ``net_g.infer`` as ``noise_scale``.
        noise_scale_w: Forwarded to ``net_g.infer`` as ``noise_scale_w``.
        auto_play: Whether the displayed audio player starts automatically.

    Returns:
        None. The text and an audio player are displayed as cell output.
    """
    token_ids = get_text(text, hps)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Add a leading batch dimension of size one.
        batch = token_ids.cuda().unsqueeze(0)
        batch_lengths = torch.LongTensor([token_ids.size(0)]).cuda()
        outputs = net_g.infer(
            batch,
            batch_lengths,
            noise_scale=noise_s,
            noise_scale_w=noise_scale_w,
            length_scale=rate,
        )
        # Take element [0, 0] of the first output and bring it to the CPU as a
        # float NumPy array for playback.
        waveform = outputs[0][0, 0].data.cpu().float().numpy()
    display(Markdown(f"{text}"))
    display(Audio(waveform, rate=hps.data.sampling_rate, autoplay=auto_play))
# Interactive controls for the synthesis form.
# The sliders and the text box use description_width='initial' so that their long
# labels are shown in full instead of being truncated to the default label width.

# Value read by the click handler as the speaking-rate argument.
speed_slider = widgets.FloatSlider(
    value=1,
    min=0.25,
    max=4,
    step=0.1,
    description="Escala de velocidad:",
    orientation='horizontal',
    style={'description_width': 'initial'},
)
# Value read by the click handler as the noise scale argument.
noise_scale_slider = widgets.FloatSlider(
    value=0.667,
    min=0.25,
    max=4,
    step=0.1,
    description="Escala de resonancia de los fonemas:",
    orientation='horizontal',
    style={'description_width': 'initial'},
)
# NOTE(review): this slider starts at 1 while run_tts defaults noise_scale_w to
# 0.8 -- confirm which default is intended.
noise_scale_w_slider = widgets.FloatSlider(
    value=1,
    min=0.25,
    max=4,
    step=0.1,
    description="Noise scale w:",
    orientation='horizontal',
    style={'description_width': 'initial'},
)
# Whether the generated audio starts playing automatically.
play = widgets.Checkbox(
    value=True,
    description="Auto reproducir:",
    disabled=False
)

# Free-text input with the sentence to synthesize.
text_input = widgets.Text(
    value='',
    placeholder="Introduce tu texto aquí:",
    description="Texto para sintetizar",
    layout=widgets.Layout(width='80%'),
    style={'description_width': 'initial'},
)

# Button that triggers synthesis once a click handler is attached.
synthesize_button = widgets.Button(
    description="Sintetizar",
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Presionando este botón se comenzará a sintetizar el texto.",
    icon='check'
)
def on_synthesize_button_clicked(b):
    """Click handler: read the form controls and synthesize the entered text.

    Args:
        b: The clicked button instance passed by ipywidgets (unused).

    Returns:
        None. Nothing is synthesized when the text box is empty or contains
        only whitespace; a short hint is printed instead.
    """
    text = text_input.value
    # Guard against empty input: there is nothing meaningful to synthesize and the
    # model would otherwise be run on an empty sequence.
    if not text.strip():
        print("Introduce un texto para sintetizar.")
        return
    rate = speed_slider.value
    noise_scale = noise_scale_slider.value
    noise_scale_w = noise_scale_w_slider.value
    auto_play = play.value
    run_tts(text, rate, noise_scale, noise_scale_w, auto_play)

# Attach the click handler, then render the controls in form order.
synthesize_button.on_click(on_synthesize_button_clicked)
display(
    text_input,
    speed_slider,
    noise_scale_slider,
    noise_scale_w_slider,
    play,
    synthesize_button,
)