In [None]:
import torch
# Empty PyTorch's CUDA cache before running the test cells below.
torch.cuda.empty_cache()

# Dataset test

In [None]:
import torch
from torch.utils.data import DataLoader

from dataset.libritts_r import LibriTTSRDataset
from dataset.length_bucket_sampler import LengthBucketSampler
from dataset.collate import collate_fn


# Dataset root (machine-specific absolute path).
root = "/mnt/d/LibriTTS-R/LibriTTS-R-16000"

# Build the three splits in train -> valid -> test order.
train_data, valid_data, test_data = (
    LibriTTSRDataset(root, split_name) for split_name in ("train", "valid", "test")
)

# Number of samples per split, one line each (train, valid, test).
for split_dataset in (train_data, valid_data, test_data):
    print(len(split_dataset))

# Field names exposed by a single training sample.
print(train_data[0].keys())

In [None]:
# One "num_units" value per metadata entry, in metadata order; these are the
# lengths handed to the bucket sampler.
train_lengths = [entry["num_units"] for entry in train_data.metadata]

# Deliberately small values so the sampler output is easy to inspect.
sampler_options = {
    "batch_size": 4,      # small batch for testing
    "bucket_size": 100,   # smaller bucket for debugging
    "shuffle": True,
}
train_sampler = LengthBucketSampler(lengths=train_lengths, **sampler_options)

# Display the sampler object as the cell output.
train_sampler

In [None]:
# Training batches are composed by the length-bucket sampler, so it is passed
# as `batch_sampler` and no `batch_size` is given here.
train_loader = DataLoader(
    dataset=train_data,
    batch_sampler=train_sampler,
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True,
)

# Validation and test loaders share one fixed-order configuration.
eval_loader_options = {
    "batch_size": 4,
    "shuffle": False,
    "collate_fn": collate_fn,
    "num_workers": 2,
}

valid_loader = DataLoader(valid_data, **eval_loader_options)
test_loader = DataLoader(test_data, **eval_loader_options)


In [None]:
# Pull one collated batch from the training loader.
batch = next(iter(train_loader))

# For each padded tensor, print its shape followed by the matching lengths entry.
for tensor_key, lengths_key in (("wav", "wav_lengths"), ("units", "unit_lengths")):
    print(batch[tensor_key].shape)
    print(batch[lengths_key])

# Conditioning embeddings: shapes only.
for embedding_key in ("speaker_emb", "emotion_emb"):
    print(batch[embedding_key].shape)


In [None]:
# Padding sanity check: axis 1 of each padded tensor must equal the longest
# sequence in the batch. The maximum is used instead of element 0 so the check
# does not depend on the collate function sorting samples longest-first.
# Assumes the *_lengths entries are 1-D sequences of per-sample lengths — TODO confirm.
assert int(max(batch["wav_lengths"])) == batch["wav"].shape[1]
assert int(max(batch["unit_lengths"])) == batch["units"].shape[1]

In [None]:
# Print the index and padded waveform shape of the first four training batches
# (indices 0..3); range(4) bounds the loop instead of an explicit break.
for i, batch in zip(range(4), train_loader):
    print(i, batch["wav"].shape)

In [None]:
# Show the per-sample unit lengths of up to ten training batches; the index is
# only used to bound the loop.
for batch_idx, batch in zip(range(10), train_loader):
    print(batch["unit_lengths"])

# ECAPA & Emotion2Vec tests

In [None]:
from models.ecapa import ECAPA
from models.emotion2vec import Emotion2Vec

# Single test utterance (machine-specific absolute path); the Emotion2Vec cell
# below reuses this same `wav_path`.
wav_path = "/mnt/d/LibriTTS-R/LibriTTS-R-16000/resampled_audio/test-clean/1089/134686/1089_134686_000034_000005.wav"

# Extract a speaker embedding for the utterance with the ECAPA wrapper.
ecapa_tdnn = ECAPA()
embeddings = ecapa_tdnn.extract_speaker_embeddings(wav_path)

# Show the raw embedding values and their shape.
print(embeddings)
print(embeddings.shape)


In [None]:
# Extract an emotion embedding for the same utterance (`wav_path` comes from
# the ECAPA cell above) and show its values and shape.
emotion2vec = Emotion2Vec()
emotion_embeddings = emotion2vec.extract_emotion_embeddings(wav_path)
print(emotion_embeddings)
print(emotion_embeddings.shape)


In [None]:
# Display the container type returned by the emotion extractor.
type(emotion_embeddings)

In [None]:
import torch

# Concatenate the speaker and emotion embeddings along the last (feature) axis.
# NOTE(review): torch.cat requires both inputs to be tensors with the same
# number of dimensions — confirm the two extractors return compatible outputs.
combined_embedd = torch.cat([embeddings, emotion_embeddings], dim=-1)

# Print the variable that was actually assigned above (the original printed an
# undefined name, `combined`, which raised NameError).
print(combined_embedd)
print(combined_embedd.shape)

# HiFi-GAN speech synthesis test

In [3]:
import torch
import IPython.display as ipd
from asrp import Code2Speech

# Unit sequence of one test utterance (machine-specific absolute path).
speech_units_path = "/mnt/d/LibriTTS-R/LibriTTS-R-16000/unit_embeddings/test-clean/4992/41797/4992_41797_000025_000006.pt"

# Load the saved tensor and convert it to a plain Python list before passing
# it to the vocoder wrapper.
speech_units = torch.load(speech_units_path).tolist()

# Reference vocoder: asrp's Code2Speech wrapper around the pretrained HiFi-GAN generator.
hifigan = Code2Speech(
    '/home/cynthia/modified-hifi-gan-voice-cloning/pretrained_models/hifigan/g_00500000',
    vocoder='hifigan',
    end_tok=999,
    code_begin_pad=0,
)

# Synthesize, then render an inline audio player as the cell output.
synthesized_audio = hifigan(speech_units)
ipd.Audio(data=synthesized_audio, autoplay=False, rate=16000)

In [4]:
import json
from pathlib import Path

import torch
import soundfile as sf

from models.hifigan_generator import UnitHiFiGANGenerator
from env import AttrDict

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model-side files (machine-specific absolute paths).
config_path = Path("/home/cynthia/modified-hifi-gan-voice-cloning/configs/base_hifigan_config.json")
pretrained_model_path = Path("/home/cynthia/modified-hifi-gan-voice-cloning/pretrained_models/hifigan/g_00500000")

# Input unit sequence and the (optional) output wav location.
speech_units_path = Path("/mnt/d/LibriTTS-R/LibriTTS-R-16000/unit_embeddings/test-clean/4992/41797/4992_41797_000025_000006.pt")
output_path = Path("/home/cynthia/modified-hifi-gan-voice-cloning/audio_samples/test_hifigan_output.wav")


# Parse the JSON config and wrap it in AttrDict in a single step.
with open(config_path) as config_file:
    hifigan_config = AttrDict(json.load(config_file))

# Build the unconditioned (no FiLM) generator and move it to the target device.
hifigan_generator = UnitHiFiGANGenerator(config=hifigan_config, use_film=False)
hifigan_generator.to(device)

# The checkpoint may store the weights under a "generator" key or be the bare
# state dict; handle both.
checkpoint = torch.load(pretrained_model_path, map_location=device)

if "generator" in checkpoint:
    checkpoint = checkpoint["generator"]

# strict=False tolerates key mismatches, so report them instead of discarding
# the result: a silent mismatch would leave parts of the model without
# pretrained weights.
missing_keys, unexpected_keys = hifigan_generator.load_state_dict(checkpoint, strict=False)

if missing_keys:
    print("Missing keys (not found in checkpoint):")
    for key in missing_keys:
        print("  ", key)

if unexpected_keys:
    print("Unexpected keys (present in checkpoint, unused by model):")
    for key in unexpected_keys:
        print("  ", key)

hifigan_generator.eval()

# Only call remove_weight_norm if the generator provides it.
if hasattr(hifigan_generator, "remove_weight_norm"):
    hifigan_generator.remove_weight_norm()

# Load the unit sequence, add a leading batch axis, and move it to the model's device.
speech_units = torch.load(speech_units_path).unsqueeze(0).to(device)

# Generate the waveform without autograd tracking, then drop singleton axes and
# bring the result back to the CPU for playback.
with torch.inference_mode():
    audio = hifigan_generator(speech_units).squeeze().cpu()

# Inline player as the cell output. `ipd` is not imported in this cell; it is
# the IPython.display alias imported by the Code2Speech cell above.
sample_rate = 16000
ipd.Audio(data=audio, autoplay=False, rate=sample_rate)

# Uncomment to also write the result to disk:
# sf.write(output_path.as_posix(), audio.numpy(), sample_rate)
# print("Saved:", output_path)

In [1]:
# --- 0) Imports & setup ---
import json
from pathlib import Path

import torch
import torch.nn.functional as F
import soundfile as sf

from models.ecapa import ECAPA
from models.emotion2vec import Emotion2Vec
from models.hifigan_generator import UnitHiFiGANGenerator
from env import AttrDict

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- 1) Paths ---
# Model-side files (machine-specific absolute paths).
config_path = Path("/home/cynthia/modified-hifi-gan-voice-cloning/configs/base_hifigan_config.json")
pretrained_model_path = Path("/home/cynthia/modified-hifi-gan-voice-cloning/checkpoints/best_model.pt")

# Reference audio (source of the conditioning embeddings) and its unit sequence.
wav_path = Path("/mnt/d/LibriTTS-R/LibriTTS-R-16000/normalized_audio/test-clean/4992/41797/4992_41797_000025_000006.wav")
units_path = Path("/mnt/d/LibriTTS-R/LibriTTS-R-16000/unit_embeddings/test-clean/4992/41797/4992_41797_000025_000006.pt")

# Where the synthesized audio is written.
output_path = Path("/home/cynthia/modified-hifi-gan-voice-cloning/audio_samples/test_hifigan_film.wav")

# --- 2) Extract conditioning embeddings (speaker, emotion) ---
def _to_batched_embedding(raw_embedding):
    """Return `raw_embedding` as a float32 tensor on `device`.

    All singleton axes are squeezed away; if the result is 1-D, a leading
    batch axis is added so it becomes [1, D]. Other ranks are returned as
    they are after the squeeze.
    """
    embedding = torch.as_tensor(raw_embedding, dtype=torch.float32, device=device).squeeze()
    return embedding.unsqueeze(0) if embedding.dim() == 1 else embedding


# --- Speaker ---
ecapa_tdnn = ECAPA()
speaker_emb = _to_batched_embedding(
    ecapa_tdnn.extract_speaker_embeddings(wav_path.as_posix())  # shape: [D_s] or [1, D_s]
)

# --- Emotion ---
emotion2vec = Emotion2Vec()
emotion_emb = _to_batched_embedding(
    emotion2vec.extract_emotion_embeddings(wav_path.as_posix())  # [D_e] or [1, D_e]
)

print("speaker_emb:", speaker_emb.shape)
print("emotion_emb:", emotion_emb.shape)

# The FiLM conditioning width is the sum of both embedding sizes.
D_s = speaker_emb.shape[-1]
D_e = emotion_emb.shape[-1]
film_cond_dim = D_s + D_e   # must match FiLM(cond_dim=...) used by the generator

# --- 3) Load config and enable FiLM ---
with open(config_path) as config_file:
    cfg = json.load(config_file)

# Set the FiLM-related keys in one step, overwriting any values from the file.
cfg.update({
    "film_cond_dim": film_cond_dim,   # concat(speaker, emotion)
    "use_film_mlp": True,             # optional: MLP FiLM (more expressive)
    "film_hidden_dim": 256,           # optional
    "film_dropout_p": 0.1,            # optional
})

h = AttrDict(cfg)

# --- 4) Build generator with FiLM turned on ---
generator = UnitHiFiGANGenerator(config=h, use_film=True).to(device)

# The checkpoint may store the weights under a "generator" key or be the bare state dict.
ckpt = torch.load(pretrained_model_path, map_location=device)
ckpt = ckpt["generator"] if "generator" in ckpt else ckpt

# Drop the condition-projection weights (speaker-only in the checkpoint): every
# key containing "cond_proj" is reported and left out of the load.
dropped_names = [name for name in ckpt if "cond_proj" in name]
for name in dropped_names:
    print(f"SKIP  {name}  (shape changed)")
filtered_ckpt = {name: weight for name, weight in ckpt.items() if "cond_proj" not in name}

# Load every remaining weight; strict=False lets key mismatches through and
# returns them for the report below.
# NOTE(review): the saved output of this cell lists film_layers.*.net.0/.net.3
# as missing and film_layers.*.net.weight/.bias as unexpected, so with
# use_film_mlp=True the FiLM layers (and cond_proj) get no weights from this
# checkpoint — confirm that is intended before judging the audio.
missing, unexpected = generator.load_state_dict(filtered_ckpt, strict=False)

print("\n=== CHECKPOINT DIAGNOSTICS ===")
print("Missing keys (expected, from new FiLM layers):")
for key in missing:
    print("  ", key)

print("Unexpected keys (unused, rare):")
for key in unexpected:
    print("  ", key)

generator.eval()
# Only call remove_weight_norm if the generator provides it.
if hasattr(generator, "remove_weight_norm"):
    generator.remove_weight_norm()

# --- 5) Load unit IDs and ensure correct type/shape ---
units = torch.load(units_path)                               # expect LongTensor [T] of unit IDs
units = units.unsqueeze(0) if units.dim() == 1 else units    # [1, T]
units = units.to(device)
# Cast to int64 only when needed. Presumably the generator looks the units up
# in an embedding table, which needs integer indices — TODO confirm.
units = units if units.dtype == torch.long else units.long()

# --- 6) (Optional) Quick FiLM sanity hook to see modulation effect ---
# Maps FiLM layer index -> mean absolute value of that layer's most recent output.
film_means = {}


def make_hook(i):
    """Build a forward hook that records layer `i`'s mean |output| in `film_means`."""
    def _record_mean_abs(module, x_in, x_out):
        # Store a plain Python float so the dict holds no tensors.
        film_means[i] = x_out.abs().mean().item()
    return _record_mean_abs


# Attach one hook per FiLM layer; skipped when the generator has no
# (or an empty) `film_layers` attribute.
film_layers = getattr(generator, "film_layers", None)
if film_layers:
    for layer_idx, film_layer in enumerate(film_layers):
        film_layer.register_forward_hook(make_hook(layer_idx))

# --- 7) Run inference with conditioning ---
# inference_mode disables autograd tracking for the forward pass.
with torch.inference_mode():
    audio = generator(units, speaker=speaker_emb, emotion=emotion_emb)  # 👈 pass both
# Drop singleton axes and move the result to the CPU so it can be saved below.
audio = audio.squeeze().detach().cpu()

# --- 8) Save & print simple diagnostics ---
# Create the destination folder first: writing the wav fails if the directory
# is missing. This is a no-op when it already exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
sf.write(output_path.as_posix(), audio.numpy(), 16000)
print("Saved:", output_path)

# film_means is filled by the forward hooks during inference; it stays empty
# when no FiLM hooks were registered, in which case the table is skipped.
if film_means:
    print("FiLM layer | mean(|activation|) after FiLM")
    for i in sorted(film_means.keys()):
        print(f"{i:9d} | {film_means[i]:.6f}")


  available_backends = torchaudio.list_audio_backends()
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/home/cynthia/modified-hifi-gan-voice-cloning/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/home/cynthia/modified-hifi-gan-voice-cloning/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.paramet

funasr version: 1.2.7.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.2.7


Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 86579.67it/s]




rtf_avg: 0.018: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  5.98it/s]                                                                                      


speaker_emb: torch.Size([1, 192])
emotion_emb: torch.Size([1, 768])
SKIP  cond_proj.weight  (shape changed)
SKIP  cond_proj.bias  (shape changed)

=== CHECKPOINT DIAGNOSTICS ===
Missing keys (expected, from new FiLM layers):
   film_layers.0.net.0.weight
   film_layers.0.net.0.bias
   film_layers.0.net.3.weight
   film_layers.0.net.3.bias
   film_layers.1.net.0.weight
   film_layers.1.net.0.bias
   film_layers.1.net.3.weight
   film_layers.1.net.3.bias
   film_layers.2.net.0.weight
   film_layers.2.net.0.bias
   film_layers.2.net.3.weight
   film_layers.2.net.3.bias
   film_layers.3.net.0.weight
   film_layers.3.net.0.bias
   film_layers.3.net.3.weight
   film_layers.3.net.3.bias
   film_layers.4.net.0.weight
   film_layers.4.net.0.bias
   film_layers.4.net.3.weight
   film_layers.4.net.3.bias
   cond_proj.weight
   cond_proj.bias
Unexpected keys (unused, rare):
   film_layers.0.net.weight
   film_layers.0.net.bias
   film_layers.1.net.weight
   film_layers.1.net.bias
   film_layers.2.

In [None]:
import torch

# Load a unit file and show its contents and shape.
# NOTE(review): unlike the other unit paths in this notebook, this one has no
# "LibriTTS-R-16000" component — confirm it points at the intended dataset copy.
units = torch.load("/mnt/d/LibriTTS-R/unit_embeddings/train-clean-100/4297/13009/4297_13009_000053_000004.pt")
print(units)
print(units.shape)

In [None]:
import matplotlib.pyplot as plt

# Line plot of the unit ID at every frame, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(units.numpy())
ax.set_xlabel("Frame Index")
ax.set_ylabel("Unit ID")
ax.set_title("Discrete Unit Sequence")
plt.show()


In [None]:
import numpy as np

# Histogram of how often each unit ID occurs in the sequence (100 bins).
fig, ax = plt.subplots()
ax.hist(units.numpy(), bins=100)
ax.set_title("Unit Distribution")
plt.show()

In [None]:
from models.ecapa import ECAPA

# Extract and print a speaker embedding for one audio sample.
# NOTE(review): DATA_PATH and AUDIO_SAMPLE are not defined in any visible cell
# of this notebook; on a fresh kernel this cell raises NameError unless they
# are defined elsewhere — confirm.
ecapa_tdnn = ECAPA()

embeddings = ecapa_tdnn.extract_speaker_embeddings(f"{DATA_PATH}/{AUDIO_SAMPLE}")
print(embeddings)

In [None]:
# Display the shape of the speaker embedding computed in the previous cell.
embeddings.shape

In [None]:
from models.emotion2vec import Emotion2Vec

# Extract and print the emotion embedding for one audio sample.
# NOTE(review): DATA_PATH and AUDIO_SAMPLE are not defined in any visible cell
# of this notebook; on a fresh kernel this cell raises NameError unless they
# are defined elsewhere — confirm.
emotion2vec = Emotion2Vec()

results = emotion2vec.extract_emotion_embeddings(f"{DATA_PATH}/{AUDIO_SAMPLE}")

print(results)