In [6]:
from pathlib import Path

import IPython.display as ipd
import torch

from notebooks.data_analysis_common_utils import phonemize_text
from src.model import commons
from src.model.synthesizer import SynthesizerTrn
from src.params import Params
from src.text.convert import text_to_sequence
from src.utils.checkpoint import load_checkpoint

In [7]:
def get_text(text: str, text_cleaners: list[str], language: str, phonemized: bool, stressed: bool):
    """Encode ``text`` as a ``torch.LongTensor`` of symbol ids.

    Every argument is forwarded unchanged, in order, to ``text_to_sequence``.
    The id sequence it returns is then passed through ``commons.intersperse``
    with ``0`` as the filler item before being wrapped in a tensor.

    NOTE(review): ``0`` is presumably the blank symbol id that the model
    expects between real symbols -- confirm against the symbol table.

    Args:
        text: The sentence to encode.
        text_cleaners: Cleaner names handed to ``text_to_sequence``.
        language: Language identifier handed to ``text_to_sequence``.
        phonemized: Flag handed to ``text_to_sequence``.
        stressed: Flag handed to ``text_to_sequence``.

    Returns:
        A ``torch.LongTensor`` built from the interspersed id sequence.
    """
    symbol_ids = text_to_sequence(text, text_cleaners, language, phonemized, stressed)
    interspersed_ids = commons.intersperse(symbol_ids, 0)
    return torch.LongTensor(interspersed_ids)

In [8]:
# Experiment name: picks the JSON config below and is reused by the
# checkpoint-loading cell to locate the log directory.
model = "studio_phoneme_finetune"

# Read the config file as UTF-8 text and validate it into a Params object.
config_file = Path(f"../files/configs/lt_{model}.json")
config_json = config_file.read_text(encoding="utf-8")
params = Params.model_validate_json(config_json)

# Build the synthesizer from the validated params and switch it to eval mode.
# The result of eval() is bound to `_` only to keep it out of the cell output.
net_g = SynthesizerTrn.from_params(params)
_ = net_g.eval()

In [9]:
# Load the step-80000 generator checkpoint of the selected experiment into net_g.
# The third argument is None -- presumably the optimizer slot, which is not
# needed for inference; confirm against load_checkpoint's signature.
checkpoint_file = Path(f"../logs/{model}/G_80000.pth")

# Left as the cell's last expression so its return value is displayed.
load_checkpoint(checkpoint_file, net_g, None)

(0.0002, 298)

In [10]:
# Sentences kept at hand for manual listening tests. Only the one selected
# into `text` below is synthesized; change the index to try another.
sample_texts = [
    "Penktadienio naktį daug kur trumpai palis",
    "Gramatiškai veiksmažodis derinamas su veiksniu",
    "Mano vardas Gabija",
    "Ar nori eiti su manimi drauge",
    "Jis pastato savo mašina ir eina iki pastato galo.",
]
# The last sample with accent marks written out by hand.
# NOTE(review): presumably meant as input for a stressed, non-phonemized
# config -- confirm before feeding it to the model.
stressed_sample_text = "Jìs pastãto sàvo mašìna ir̃ eĩna ikì pãstato gãlo."

# Input actually used by this cell: the phonemized form of the last sample.
text = phonemize_text(sample_texts[-1])

# Encode the text with the same text settings the loaded config declares.
stn_tst = get_text(
    text,
    params.data.text_cleaners,
    params.data.language,
    params.data.phonemized,
    params.data.stressed,
)

with torch.inference_mode():
    # Add a leading dimension of size 1 and pass the sequence length alongside.
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    # infer() returns an indexable result; the waveform is taken from
    # element 0 at index [0, 0].
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=0.667,
        noise_scale_w=0.5,
        length_scale=1.0,
    )[0][0, 0]

# detach() is the supported replacement for the legacy `.data` attribute, and
# cpu() keeps the NumPy conversion working if the model is ever moved to an
# accelerator (it is a no-op for tensors that already live on the CPU).
waveform = audio.detach().cpu().float().numpy()

# normalize=False plays the samples as produced, without rescaling.
ipd.display(ipd.Audio(waveform, rate=params.data.sampling_rate, normalize=False))