In [None]:
!pip install -q kokoro>=0.9.4 soundfile
!apt-get -qq -y install espeak-ng > /dev/null 2>&1

In [None]:
# Fetch the Kokoro sources and install the package from the local checkout.
!git clone https://github.com/hexgrad/kokoro.git
# %cd changes the kernel's working directory, so relative paths used by later
# cells (e.g. "kokoro_outputs") resolve inside the clone.
# NOTE(review): this cell is not idempotent — re-running it clones and changes
# directory relative to the already-changed working directory.
%cd kokoro
!pip install -q .

In [None]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the metrics and comparison cells read
# the reference recordings from a folder under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd
from kokoro import KPipeline
import soundfile as sf
import os
import numpy as np


def _as_numpy(audio):
    """Return `audio` as a NumPy array.

    Tensor-like objects exposing `detach()` are detached and moved to the CPU
    first; anything else is passed through `np.asarray`.
    """
    if hasattr(audio, "detach"):
        return audio.detach().cpu().numpy()
    return np.asarray(audio)


# Metadata is read with its header row; the loop uses the `wav_file` and
# `transcript` columns.
df = pd.read_csv("/content/metadata.csv")
pipeline = KPipeline(lang_code='a')

# Relative path: the directory is created under the current working directory.
os.makedirs("kokoro_outputs", exist_ok=True)

# Generate speech for each metadata row and save it under the row's file name.
for idx, row in df.iterrows():
    utt_id = row['wav_file']
    text = row['transcript']

    # The pipeline can yield more than one (gs, ps, audio) item for a single
    # text. Writing each item to the same path would keep only the last one,
    # so all audio segments are collected and written once.
    segments = [
        _as_numpy(audio)
        for _, _, audio in pipeline(text, voice='af_heart')
        if audio is not None
    ]

    if not segments:
        print(f"No audio generated for {utt_id}; skipping")
        continue

    output_path = f"kokoro_outputs/{utt_id}"
    sf.write(output_path, np.concatenate(segments), 24000)
    print(f"Saved {output_path}")


In [None]:
# Dependencies of the metrics cell: librosa and scipy are imported there, and
# the `dtw` package supplies the `accelerated_dtw` function it uses.
!pip install librosa dtw scipy

In [None]:
import soundfile as sf

# Spot-check one synthesized file and print the sample rate soundfile reports
# for it.
sample_wav = "/content/kokoro/kokoro_outputs/007b4f2d-e768-4307-b8e4-ccd04c5717a8.wav"
print("Sample rate:", sf.info(sample_wav).samplerate)

In [None]:
import os
import numpy as np
import librosa
# Kept although no longer used below: other cells may rely on this name.
from scipy.spatial.distance import euclidean
from scipy.special import kl_div
from dtw import accelerated_dtw
import pandas as pd

ref_dir = '/content/drive/MyDrive/8ca4b055-1f82-4578-a268-883acafc9da6-20250512T135208Z-1-001/8ca4b055-1f82-4578-a268-883acafc9da6'  # original audio
gen_dir = '/content/kokoro/kokoro_outputs'  # synthesized audio

# Both recordings are resampled to this rate when loaded.
sr = 24000

results = []

# Compare every reference .wav with the synthesized file of the same name.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the row order of the results (and any plot indexed by it) reproducible.
for filename in sorted(os.listdir(ref_dir)):
    if not filename.lower().endswith(".wav"):
        continue

    ref_path = os.path.join(ref_dir, filename)
    gen_path = os.path.join(gen_dir, filename)

    # Skip references that have no synthesized counterpart.
    if not os.path.exists(gen_path):
        print(f"Missing synthesized file for: {filename}")
        continue

    # Load both waveforms at the common sample rate.
    y_ref, _ = librosa.load(ref_path, sr=sr)
    y_gen, _ = librosa.load(gen_path, sr=sr)

    # Truncate to the shorter signal so sample-wise differences are defined.
    min_len = min(len(y_ref), len(y_gen))
    if min_len == 0:
        # An empty signal would produce NaN means and cannot be turned into a
        # meaningful spectrogram, so it is reported and skipped.
        print(f"Empty audio for: {filename}")
        continue
    y_ref = y_ref[:min_len]
    y_gen = y_gen[:min_len]

    # Waveform losses: mean absolute and mean squared sample difference.
    l1_loss = np.mean(np.abs(y_ref - y_gen))
    l2_loss = np.mean((y_ref - y_gen) ** 2)

    # 80-band mel spectrograms of both signals.
    mel_ref = librosa.feature.melspectrogram(y=y_ref, sr=sr, n_mels=80)
    mel_gen = librosa.feature.melspectrogram(y=y_gen, sr=sr, n_mels=80)

    # Keep the same number of frames in both and add a small offset so no
    # bin is exactly zero (avoids log(0) in the KL computation).
    min_frames = min(mel_ref.shape[1], mel_gen.shape[1])
    mel_ref = mel_ref[:, :min_frames] + 1e-9
    mel_gen = mel_gen[:, :min_frames] + 1e-9

    # Normalize each spectrogram to sum to 1 so it can be treated as a
    # probability distribution.
    mel_ref /= mel_ref.sum()
    mel_gen /= mel_gen.sum()

    # KL divergence: element-wise kl_div terms summed over all bins.
    kl = np.sum(kl_div(mel_ref, mel_gen))

    # DTW score over frames (rows after the transpose). The metric is given
    # by name rather than as a Python callable so the pairwise distances are
    # computed by the optimized implementation instead of one Python call
    # per frame pair.
    dist, _, _, _ = accelerated_dtw(mel_ref.T, mel_gen.T, dist='euclidean')

    results.append({
        "file": filename,
        "L1_loss": l1_loss,
        "L2_loss": l2_loss,
        "KL_divergence": kl,
        "DTW_score": dist
    })

# Collect the per-file metrics, persist them for the plotting cell, and
# preview the first rows.
df = pd.DataFrame(results)
df.to_csv("/content/similarity_metrics.csv", index=False)
df.head()


In [None]:
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
import os

csv_path = "/content/metadata.csv"
# Read the metadata with its header row, using the same `wav_file` /
# `transcript` columns as the synthesis cell, so transcripts can be looked up
# by file name.
df = pd.read_csv(csv_path)
transcripts = dict(zip(df['wav_file'].astype(str).str.strip(), df['transcript']))

gt_dir = "/content/drive/MyDrive/8ca4b055-1f82-4578-a268-883acafc9da6-20250512T135208Z-1-001/8ca4b055-1f82-4578-a268-883acafc9da6"
gen_dir = "/content/kokoro/kokoro_outputs"

example_files = ["14f83a8f-178e-4167-b562-6cc44abbb0f8.wav", "cd0bb5d2-e2a8-41fb-8ef1-07341c50dffb.wav", "4f888546-a2a9-4c5b-993c-b83b1b5f9374.wav"]

# For each example: show its transcript (when listed), overlay the two
# waveforms, and offer both recordings for playback.
for file in example_files:
    print(f"\n Example: {file}")

    # Accept metadata entries written with or without the ".wav" suffix.
    text = transcripts.get(file, transcripts.get(file.replace('.wav', '')))
    if text is not None:
        print(f"Transcript: {text}")

    gt_path = f"{gt_dir}/{file}"
    gen_path = f"{gen_dir}/{file}"

    # Report and skip examples whose audio is missing instead of aborting the
    # whole cell on the first absent file.
    missing = [path for path in (gt_path, gen_path) if not os.path.exists(path)]
    if missing:
        print(f"Skipping {file}; missing: {missing}")
        continue

    # Load both recordings at the same sample rate.
    y_gt, sr = librosa.load(gt_path, sr=24000)
    y_gen, _ = librosa.load(gen_path, sr=24000)

    # Waveform comparison: both signals overlaid on one axis.
    plt.figure(figsize=(12, 3))
    librosa.display.waveshow(y_gt, sr=sr, alpha=0.5, label='Ground Truth')
    librosa.display.waveshow(y_gen, sr=sr, color='r', alpha=0.5, label='Synthesized')
    plt.title(f"Waveform Comparison: {file}")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Audio playback widgets.
    print("Ground Truth:")
    ipd.display(ipd.Audio(y_gt, rate=sr))

    print("Synthesized:")
    ipd.display(ipd.Audio(y_gen, rate=sr))


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): the values plotted here are synthetic — an exponential decay
# plus Gaussian noise generated in this cell — not losses logged from a
# training run. Confirm that this is the intended content of the figure.
epochs = 50

# Seeded generator so the noise, and therefore the figure, is identical on
# every run.
rng = np.random.default_rng(0)
loss = np.exp(-np.linspace(0, 5, epochs)) + rng.normal(0, 0.01, epochs)

plt.plot(range(1, epochs+1), loss, label='Training Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-file metrics written by the metrics cell.
df = pd.read_csv("/content/similarity_metrics.csv")

# One entry per panel, in subplot order: (CSV column, title and y-axis label,
# line colour).
panels = [
    ('L1_loss', "L1 Loss", 'blue'),
    ('L2_loss', "L2 Loss", 'green'),
    ('KL_divergence', "KL Divergence", 'orange'),
    ('DTW_score', "DTW Score", 'red'),
]

plt.figure(figsize=(12, 8))

# Draw each metric against the row index in its own cell of a 2x2 grid.
for position, (column, label, colour) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(df[column], marker='o', color=colour)
    plt.title(label)
    plt.xlabel("Sample Index")
    plt.ylabel(label)

plt.tight_layout()
plt.show()


In [None]:
from google.colab import files
# Request a download of the archive at the given path.
# NOTE(review): no cell visible in this notebook creates
# /content/content_folder.zip — confirm where it is produced, otherwise this
# call will fail on a fresh run.
files.download('/content/content_folder.zip')
