In [None]:
import hydra
import torch
from audiocraft.models import MultiBandDiffusion
from audiotools import AudioSignal
from huggingface_hub import hf_hub_download

from pflow_encodec.data.tokenizer import EncodecTokenizer, TextTokenizer

In [None]:
def load_model(ckpt_path, device="cpu"):
    """Rebuild a model from a checkpoint file and return it with its data config.

    The checkpoint is deserialised onto the CPU first. The model is created by
    handing the checkpoint's ``"model_config"`` entry to
    ``hydra.utils.instantiate``; the ``"state_dict"`` entry is then loaded into
    it, the model is switched to eval mode and moved to ``device``.

    Args:
        ckpt_path: Checkpoint location passed straight to ``torch.load``.
        device: Device the finished model is moved to (defaults to ``"cpu"``).

    Returns:
        A ``(model, data_config)`` tuple, where ``data_config`` is the
        checkpoint's ``"data_config"`` entry, returned unmodified.
    """
    # NOTE(review): depending on the installed torch version, torch.load may
    # unpickle arbitrary objects -- only open checkpoints from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location="cpu")

    network = hydra.utils.instantiate(checkpoint["model_config"])
    network.load_state_dict(checkpoint["state_dict"])
    network = network.eval().to(device)

    data_config = checkpoint["data_config"]
    return network, data_config

In [None]:
# Fetch the multilingual checkpoint file from the Hugging Face Hub repo; the
# result is stored as ckpt_path and handed to load_model in the next cell.
ckpt_path = hf_hub_download(repo_id="seastar105/pflow-encodec-ejk", filename="multilingual_base_bs100x4.ckpt")

In [None]:
# Load the model and its data config. Prefer the GPU, but fall back to the CPU
# so the notebook still runs on machines without CUDA (pflow_inference reads the
# device from the model's parameters, so no other cell needs to change).
model, data_config = load_model(ckpt_path, "cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Fetch one prompt recording per sample set (LibriTTS-R, JSUT, KSS) from the
# same Hub repo; each result is later passed to pflow_inference as prompt_path.
e_prompt = hf_hub_download(repo_id="seastar105/pflow-encodec-ejk", filename="samples/libritts_r_prompt.wav")
j_prompt = hf_hub_download(repo_id="seastar105/pflow-encodec-ejk", filename="samples/jsut_prompt.wav")
k_prompt = hf_hub_download(repo_id="seastar105/pflow-encodec-ejk", filename="samples/kss_prompt.wav")

In [None]:
# Module-level text tokenizer; pflow_inference reads this name to call encode_text.
text_tokenizer = TextTokenizer()

In [None]:
# Module-level Encodec tokenizer shared by pflow_inference and mbd_decode
# (they call encode_file, decode_latents and quantize_latents on it).
encodec_tokenizer = EncodecTokenizer()

In [None]:
# MultiBandDiffusion decoder obtained from get_mbd_24khz with bw=6; passed to
# mbd_decode below as an alternative to encodec_tokenizer.decode_latents.
mbd_model = MultiBandDiffusion.get_mbd_24khz(bw=6)

In [None]:
@torch.inference_mode()
def pflow_inference(
    model, text, prompt_path, data_config, cfg_scale=1.0, n_steps=16, ode_method="midpoint", return_latent=False
):
    """Generate speech for ``text`` conditioned on the prompt file at ``prompt_path``.

    Relies on the module-level ``encodec_tokenizer`` and ``text_tokenizer``.

    Args:
        model: Object exposing ``parameters()`` and ``generate(...)``; all tensors
            fed to it are placed on the device of its first parameter.
        text: Text handed to ``text_tokenizer.encode_text``.
        prompt_path: Prompt file handed to ``encodec_tokenizer.encode_file``.
        data_config: Mapping providing ``"mean"``, ``"std"`` and
            ``"text2latent_ratio"``.
        cfg_scale: Forwarded to ``model.generate``.
        n_steps: Forwarded to ``model.generate``.
        ode_method: Forwarded to ``model.generate``.
        return_latent: When true, skip decoding and return the latents.

    Returns:
        A CPU tensor: the de-normalised output of ``model.generate`` when
        ``return_latent`` is true, otherwise the result of passing it through
        ``encodec_tokenizer.decode_latents``.
    """
    target_device = next(model.parameters()).device
    prompt_latent = encodec_tokenizer.encode_file(prompt_path).to(target_device)
    latent_mean, latent_std = data_config["mean"], data_config["std"]
    ratio = data_config["text2latent_ratio"]

    # encode_text output gets a leading axis added before it is fed to the model.
    tokens = text_tokenizer.encode_text(text).to(target_device).unsqueeze(0)

    # The prompt is standardised with the data-config statistics going in, and
    # the model output is mapped back with the same statistics coming out.
    normalized_prompt = (prompt_latent - latent_mean) / latent_std
    generate_kwargs = dict(cfg_scale=cfg_scale, n_steps=n_steps, ode_method=ode_method, upscale_ratio=ratio)
    generated = model.generate(tokens, normalized_prompt, **generate_kwargs)
    latents = generated * latent_std + latent_mean

    if not return_latent:
        # The tokenizer may sit on a different device/dtype than the model.
        decoder_input = latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
        return encodec_tokenizer.decode_latents(decoder_input).cpu()
    return latents.cpu()

In [None]:
@torch.inference_mode()
def mbd_decode(mbd_model, latent, n_q=8):
    """Decode latents to a waveform through ``mbd_model`` instead of the Encodec decoder.

    The latents are quantised with the module-level ``encodec_tokenizer``, the
    first ``n_q`` entries along axis 1 of the resulting codes are kept, and
    those are passed to ``mbd_model.tokens_to_wav``.

    Args:
        mbd_model: Object exposing ``tokens_to_wav(codes)``.
        latent: Latent tensor, e.g. the output of ``pflow_inference`` with
            ``return_latent=True``.
        n_q: How many entries along axis 1 of the codes to keep. The default of
            8 reproduces the slice that was previously hard-coded here
            (presumably the codebook count expected by the ``bw=6`` model this
            notebook loads -- confirm against audiocraft before changing it).

    Returns:
        The output of ``tokens_to_wav`` moved to the CPU.
    """
    # Match both device and dtype of the tokenizer, as the decode path in
    # pflow_inference does; moving only the device would hand the quantizer a
    # mismatched dtype whenever the tokenizer is not in the latents' precision.
    tokenizer_input = latent.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
    codes = encodec_tokenizer.quantize_latents(tokenizer_input)
    recon = mbd_model.tokens_to_wav(codes[:, :n_q, :])
    return recon.cpu()

In [None]:
# English test sentence, synthesised with the LibriTTS-R prompt in the next cell.
e_text = "P-Flow encodec is Text-to-Speech model trained on Encodec latent space, using Flow Matching."

In [None]:
# Sample latents for the English sentence, conditioned on the LibriTTS-R prompt.
# They are kept in `latents` because the following cell decodes them again.
latents = pflow_inference(
    model,
    e_text,
    e_prompt,
    data_config,
    cfg_scale=1.2,
    n_steps=16,
    ode_method="midpoint",
    return_latent=True,
)
# Decode with the Encodec tokenizer, then detach and bring the audio to the CPU.
pflow_result = encodec_tokenizer.decode_latents(
    latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
)
pflow_result = pflow_result.detach().cpu()
# 24000 is passed as the AudioSignal sample rate.
pflow_signal = (
    AudioSignal(pflow_result, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
pflow_signal.embed()

In [None]:
# Decode the latents left in `latents` by the preceding cell through mbd_decode,
# for comparison with the Encodec decode above.
mbd_recon = mbd_decode(mbd_model, latents)
# 24000 is passed as the AudioSignal sample rate.
mbd_signal = (
    AudioSignal(mbd_recon, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
mbd_signal.embed()

In [None]:
# Japanese test sentence, synthesised with the JSUT prompt in the next cell.
j_text = "こんにちは、初めまして。あなたの名前はなんですか？これは音声合成モデルから作られた音声です。"

In [None]:
# Sample latents for the Japanese sentence, conditioned on the JSUT prompt.
# They are kept in `latents` because the following cell decodes them again.
latents = pflow_inference(
    model,
    j_text,
    j_prompt,
    data_config,
    cfg_scale=1.2,
    n_steps=16,
    ode_method="midpoint",
    return_latent=True,
)
# Decode with the Encodec tokenizer, then detach and bring the audio to the CPU.
pflow_result = encodec_tokenizer.decode_latents(
    latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
)
pflow_result = pflow_result.detach().cpu()
# 24000 is passed as the AudioSignal sample rate.
pflow_signal = (
    AudioSignal(pflow_result, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
pflow_signal.embed()

In [None]:
# Decode the latents left in `latents` by the preceding cell through mbd_decode,
# for comparison with the Encodec decode above.
mbd_recon = mbd_decode(mbd_model, latents)
# 24000 is passed as the AudioSignal sample rate.
mbd_signal = (
    AudioSignal(mbd_recon, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
mbd_signal.embed()

In [None]:
# Korean test sentence, synthesised with the KSS prompt in the next cell.
k_text = "백남준은 미디어 아트의 개척자로서 다양한 테크놀로지를 이용하여 실험적이고 창의적으로 작업했다."

In [None]:
# Sample latents for the Korean sentence, conditioned on the KSS prompt.
# They are kept in `latents` because the following cell decodes them again.
latents = pflow_inference(
    model,
    k_text,
    k_prompt,
    data_config,
    cfg_scale=1.2,
    n_steps=16,
    ode_method="midpoint",
    return_latent=True,
)
# Decode with the Encodec tokenizer, then detach and bring the audio to the CPU.
pflow_result = encodec_tokenizer.decode_latents(
    latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
)
pflow_result = pflow_result.detach().cpu()
# 24000 is passed as the AudioSignal sample rate.
pflow_signal = (
    AudioSignal(pflow_result, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
pflow_signal.embed()

In [None]:
# Decode the latents left in `latents` by the preceding cell through mbd_decode,
# for comparison with the Encodec decode above.
mbd_recon = mbd_decode(mbd_model, latents)
# 24000 is passed as the AudioSignal sample rate.
mbd_signal = (
    AudioSignal(mbd_recon, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
mbd_signal.embed()

In [None]:
# Code-switching test sentence mixing English, Japanese and Korean; the cells
# below synthesise it once with each of the three prompts.
code_text = "There's famous japanese sentence, つきがきれいですね, which means 나는 당신을 사랑합니다."

In [None]:
# Sample latents for the code-switched sentence, conditioned on the LibriTTS-R
# prompt (e_prompt). They are kept in `latents` for the following cell.
latents = pflow_inference(
    model,
    code_text,
    e_prompt,
    data_config,
    cfg_scale=1.2,
    n_steps=16,
    ode_method="midpoint",
    return_latent=True,
)
# Decode with the Encodec tokenizer, then detach and bring the audio to the CPU.
pflow_result = encodec_tokenizer.decode_latents(
    latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
)
pflow_result = pflow_result.detach().cpu()
# 24000 is passed as the AudioSignal sample rate.
pflow_signal = (
    AudioSignal(pflow_result, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
pflow_signal.embed()

In [None]:
# Decode the latents left in `latents` by the preceding cell through mbd_decode,
# for comparison with the Encodec decode above.
mbd_recon = mbd_decode(mbd_model, latents)
# 24000 is passed as the AudioSignal sample rate.
mbd_signal = (
    AudioSignal(mbd_recon, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
mbd_signal.embed()

In [None]:
# Sample latents for the code-switched sentence, conditioned on the JSUT
# prompt (j_prompt). They are kept in `latents` for the following cell.
latents = pflow_inference(
    model,
    code_text,
    j_prompt,
    data_config,
    cfg_scale=1.2,
    n_steps=16,
    ode_method="midpoint",
    return_latent=True,
)
# Decode with the Encodec tokenizer, then detach and bring the audio to the CPU.
pflow_result = encodec_tokenizer.decode_latents(
    latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
)
pflow_result = pflow_result.detach().cpu()
# 24000 is passed as the AudioSignal sample rate.
pflow_signal = (
    AudioSignal(pflow_result, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
pflow_signal.embed()

In [None]:
# Decode the latents left in `latents` by the preceding cell through mbd_decode,
# for comparison with the Encodec decode above.
mbd_recon = mbd_decode(mbd_model, latents)
# 24000 is passed as the AudioSignal sample rate.
mbd_signal = (
    AudioSignal(mbd_recon, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
mbd_signal.embed()

In [None]:
# Sample latents for the code-switched sentence, conditioned on the KSS
# prompt (k_prompt). They are kept in `latents` for the following cell.
latents = pflow_inference(
    model,
    code_text,
    k_prompt,
    data_config,
    cfg_scale=1.2,
    n_steps=16,
    ode_method="midpoint",
    return_latent=True,
)
# Decode with the Encodec tokenizer, then detach and bring the audio to the CPU.
pflow_result = encodec_tokenizer.decode_latents(
    latents.to(device=encodec_tokenizer.device, dtype=encodec_tokenizer.dtype)
)
pflow_result = pflow_result.detach().cpu()
# 24000 is passed as the AudioSignal sample rate.
pflow_signal = (
    AudioSignal(pflow_result, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
pflow_signal.embed()

In [None]:
# Decode the latents left in `latents` by the preceding cell through mbd_decode,
# for comparison with the Encodec decode above.
mbd_recon = mbd_decode(mbd_model, latents)
# 24000 is passed as the AudioSignal sample rate.
mbd_signal = (
    AudioSignal(mbd_recon, 24000)
    .normalize(-23)
    .ensure_max_of_audio()
)
# Last expression of the cell: its value is what the notebook displays.
mbd_signal.embed()