In [None]:
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
from tqdm.notebook import tqdm

from notebooks.data_analysis_common_utils import (
    preprocess_text_for_synthesis,
    seed_everything,
    phonemize_text,
)
from src.model.synthesizer import SynthesizerTrn
from src.params import Params
from src.utils.checkpoint import load_checkpoint

### Synthesize plain text samples

In [None]:
# Load the configuration for the plain-text model.
model = "studio_plain_finetune"
config_path = Path(f"../files/configs/lt_{model}.json")
params = Params.model_validate_json(config_path.read_text(encoding="utf-8"))

# One second of silence at the configured sampling rate; used to pad each clip.
sr = params.data.sampling_rate
silence = np.zeros(sr, dtype=np.float32)

# Build the generator, switch it to eval mode and load the checkpoint into it.
net_g = SynthesizerTrn.from_params(params)
_ = net_g.eval()
load_checkpoint(Path(f"../logs/{model}/G_80000.pth"), net_g, None)

output_dir = Path(f"../files/{model}_clips")
output_dir.mkdir(exist_ok=True)

# Read the evaluation sentences, one per line, dropping blank lines.
with open("../files/eval_sentences_plain.txt", "r", encoding="utf-8") as file:
    stripped_lines = (raw_line.strip() for raw_line in file)
    texts = [sentence for sentence in stripped_lines if sentence]

for clip_no, sentence in enumerate(tqdm(texts, desc="Generating audio"), start=1):
    # The seed is reset before every sentence (presumably so that each clip
    # is reproducible independently of the others -- confirm with the helper).
    seed_everything(params.train.seed)
    token_ids = preprocess_text_for_synthesis(
        sentence,
        params.data.text_cleaners,
        params.data.language,
        params.data.phonemized,
        params.data.stressed,
    )

    with torch.inference_mode():
        # Add a leading batch dimension of size one and pass the sequence length.
        batch = token_ids.unsqueeze(0)
        batch_lengths = torch.LongTensor([token_ids.size(0)])
        generated = net_g.infer(
            batch,
            batch_lengths,
            noise_scale=0.667,
            noise_scale_w=0.5,
            length_scale=1.0,
        )
        waveform = generated[0][0, 0].data.float().numpy()

    # Pad silence before & after, then write the clip to disk.
    clip = np.concatenate([silence, waveform, silence], axis=0)
    sf.write(output_dir / f"sentence_{clip_no:03d}.wav", clip, sr)

### Synthesize phoneme text samples

In [None]:
# Load the configuration for the phoneme model.
model = "studio_phoneme_finetune"
params = Params.model_validate_json(
    Path(f"../files/configs/lt_{model}.json").read_text(encoding="utf-8")
)

# Build the padding buffer from THIS model's sampling rate instead of relying
# on a `silence` array left in the kernel by another cell: that made the cell
# fail on its own and would give the wrong padding duration if the sampling
# rates of the two configurations ever differ.
sr = params.data.sampling_rate
silence = np.zeros(sr, dtype=np.float32)  # 1 second of silence

# Build the generator, switch it to eval mode and load the checkpoint into it.
net_g = SynthesizerTrn.from_params(params)
_ = net_g.eval()
load_checkpoint(Path(f"../logs/{model}/G_80000.pth"), net_g, None)

output_dir = Path(f"../files/{model}_clips")
output_dir.mkdir(exist_ok=True)

# The plain evaluation sentences are read here and phonemized in the loop below.
with open("../files/eval_sentences_plain.txt", "r", encoding="utf-8") as file:
    texts = [line.strip() for line in file if line.strip()]

for i, text in enumerate(tqdm(texts, desc="Generating audio"), start=1):
    # The seed is reset before every sentence (presumably so that each clip
    # is reproducible independently of the others -- confirm with the helper).
    seed_everything(params.train.seed)
    src = preprocess_text_for_synthesis(
        phonemize_text(text),
        params.data.text_cleaners,
        params.data.language,
        params.data.phonemized,
        params.data.stressed,
    )

    with torch.inference_mode():
        # Add a leading batch dimension of size one and pass the sequence length.
        x_src = src.unsqueeze(0)
        x_src_lengths = torch.LongTensor([src.size(0)])
        audio = net_g.infer(
            x_src,
            x_src_lengths,
            noise_scale=0.667,
            noise_scale_w=0.5,
            length_scale=1.0,
        )[0][0, 0]
        # detach().cpu() keeps the numpy conversion valid regardless of the
        # device the model output lives on.
        audio_np = audio.detach().cpu().float().numpy()

    # pad silence before & after
    padded_audio = np.concatenate([silence, audio_np, silence], axis=0)
    filename = output_dir / f"sentence_{i:03d}.wav"
    sf.write(filename, padded_audio, sr)

### Synthesize stressed text samples

In [None]:
# Load the configuration for the stressed-text model.
model = "studio_stressed_finetune"
params = Params.model_validate_json(
    Path(f"../files/configs/lt_{model}.json").read_text(encoding="utf-8")
)

# Build the padding buffer from THIS model's sampling rate instead of relying
# on a `silence` array left in the kernel by another cell: that made the cell
# fail on its own and would give the wrong padding duration if the sampling
# rates of the configurations ever differ.
sr = params.data.sampling_rate
silence = np.zeros(sr, dtype=np.float32)  # 1 second of silence

# Build the generator, switch it to eval mode and load the checkpoint into it.
net_g = SynthesizerTrn.from_params(params)
_ = net_g.eval()
load_checkpoint(Path(f"../logs/{model}/G_80000.pth"), net_g, None)

# The same model is run over two sentence files, one per source type; each
# source type gets its own input file and its own output directory.
for source_type in ["transformer", "vdu"]:
    output_dir = Path(f"../files/{model}_{source_type}_clips")
    output_dir.mkdir(exist_ok=True)

    with open(f"../files/eval_sentences_stressed_{source_type}.txt", "r", encoding="utf-8") as file:
        texts = [line.strip() for line in file if line.strip()]

    for i, text in enumerate(tqdm(texts, desc="Generating audio"), start=1):
        # The seed is reset before every sentence (presumably so that each
        # clip is reproducible independently of the others -- confirm).
        seed_everything(params.train.seed)
        src = preprocess_text_for_synthesis(
            text,
            params.data.text_cleaners,
            params.data.language,
            params.data.phonemized,
            params.data.stressed,
        )

        with torch.inference_mode():
            # Add a leading batch dimension of size one and pass the length.
            x_src = src.unsqueeze(0)
            x_src_lengths = torch.LongTensor([src.size(0)])
            audio = net_g.infer(
                x_src,
                x_src_lengths,
                noise_scale=0.667,
                noise_scale_w=0.5,
                length_scale=1.0,
            )[0][0, 0]
            # detach().cpu() keeps the numpy conversion valid regardless of
            # the device the model output lives on.
            audio_np = audio.detach().cpu().float().numpy()

        # pad silence before & after
        padded_audio = np.concatenate([silence, audio_np, silence], axis=0)
        filename = output_dir / f"sentence_{i:03d}.wav"
        sf.write(filename, padded_audio, sr)