## Imports

In [None]:
import os
import glob
import numpy as np
import librosa
import joblib
import matplotlib.pyplot as plt
import python_speech_features as psf
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from tqdm import tqdm

## Configuration

In [None]:
# ==========================================
# 1. CONFIGURATION
# ==========================================
SPEAKERS_ROOT = "./data/speakers/"  # Root folder; each subfolder holds one speaker's recordings
MODELS_DIR = "trained_models_speakers/"  # Output folder for the trained speaker models

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(MODELS_DIR, exist_ok=True)

# MFCC parameters
NUM_CEP = 13  # Number of cepstral coefficients (passed as `numcep` to psf.mfcc)
WIN_FUNC = np.hamming  # Analysis window (passed as `winfunc` to psf.mfcc)
# Candidate numbers of Gaussian components tried for each speaker model.
N_COMPONENTS_LIST = [4, 8, 16, 32, 64, 128, 256, 512]

## File gathering

In [None]:
# ==========================================
# 2. FILE GATHERING (The Logic Update)
# ==========================================
def get_speaker_train_files(speaker_path):
    """
    Collect every training audio file that belongs to one speaker.

    Looks inside ``<speaker_path>/<any language folder>/train/`` and keeps
    the ``.wav`` and ``.flac`` files found there.

    Args:
        speaker_path: Path to a single speaker's folder. It is treated as a
            literal path, so glob metacharacters in the name (for example
            ``Friend[1]``) are matched verbatim.

    Returns:
        A sorted list of file paths (empty when nothing matches). Sorting
        makes the order independent of the filesystem's listing order;
        extensions are matched case-insensitively.
    """
    audio_extensions = (".wav", ".flac")

    # Pattern: <speaker_path>/*/train/* -- the first '*' matches any language
    # folder name. Only the caller-supplied prefix is escaped.
    pattern = os.path.join(glob.escape(speaker_path), "*", "train", "*")

    return sorted(
        path
        for path in glob.glob(pattern)
        if path.lower().endswith(audio_extensions) and os.path.isfile(path)
    )

## Feature extraction

In [None]:
# ==========================================
# 3. FEATURE EXTRACTION FUNCTION
# ==========================================
def _speech_features_for_file(audio_path):
    """Return stacked MFCC + delta features for one file, or None if too short.

    None is returned when the file yields fewer than 10 MFCC frames, or fewer
    than 10 frames remain after the low-energy frames are dropped.
    """
    samples, rate = librosa.load(audio_path, sr=None)
    if samples.ndim > 1:
        samples = samples[0]

    # MFCC: larger FFT size for sample rates above 16 kHz
    fft_size = 2048 if rate > 16000 else 1024
    cepstra = psf.mfcc(
        samples,
        rate,
        numcep=NUM_CEP,
        nfft=fft_size,
        winfunc=WIN_FUNC,
        appendEnergy=False,
    )
    if len(cepstra) < 10:
        return None

    # Hybrid silence removal: per-frame sum of squared coefficients, clustered
    # into two groups by both K-Means and a 2-component GMM.
    frame_energy = np.square(cepstra).sum(axis=1).reshape(-1, 1)
    km = KMeans(n_clusters=2, n_init='auto', random_state=0).fit(frame_energy)
    gm = GaussianMixture(n_components=2, random_state=42).fit(frame_energy)

    # Threshold = average of the two models' lowest cluster centres.
    cutoff = (gm.means_.min() + km.cluster_centers_.min()) / 2
    keep = frame_energy.ravel() > cutoff

    voiced = cepstra[keep]
    if len(voiced) < 10:
        return None

    # Append delta coefficients computed on the retained frames.
    return np.hstack((voiced, psf.delta(voiced, 2)))


def extract_features_from_files(files):
    """Build one feature matrix from a collection of audio files.

    Each file is converted to MFCC + delta features with low-energy frames
    removed. Files that raise an error are reported on stdout and skipped;
    files that are too short are skipped silently.

    Args:
        files: Iterable of audio file paths (may be empty or None).

    Returns:
        A 2-D array with the features of all usable files stacked row-wise,
        or None when `files` is empty or no file produced usable features.
    """
    if not files:
        return None

    collected = []
    for file_path in tqdm(files, leave=False):
        try:
            file_features = _speech_features_for_file(file_path)
            if file_features is not None:
                collected.append(file_features)
        except Exception as e:
            print(f"Error reading {os.path.basename(file_path)}: {e}")

    return np.vstack(collected) if collected else None

## Training

In [None]:
# ==========================================
# 4. TRAINING LOOP
# ==========================================
# Per-speaker summary used by the plotting cell:
#   speaker -> {"n": component counts tried, "bic": their BIC scores, "best": chosen count}
speaker_results = {}

# Get list of speaker folders. Sorted so speakers are processed and reported
# in the same order on every run (os.listdir order is arbitrary).
speakers = sorted(
    d for d in os.listdir(SPEAKERS_ROOT)
    if os.path.isdir(os.path.join(SPEAKERS_ROOT, d))
)

print(f"Found {len(speakers)} speakers: {speakers}")

for speaker in speakers:
    speaker_path = os.path.join(SPEAKERS_ROOT, speaker)
    print(f"\n--- Processing Speaker: {speaker} ---")

    # 1. Gather all 'train' files from all languages
    train_files = get_speaker_train_files(speaker_path)
    print(f" -> Found {len(train_files)} training files (across all languages).")

    # 2. Extract Features
    features = extract_features_from_files(train_files)
    if features is None:
        print(" -> Not enough audio data. Skipping.")
        continue

    print(f" -> Total Frames: {len(features)}")

    # 3. Train GMMs & Study Complexity
    # bic_scores gets exactly one entry per candidate size, in list order;
    # sizes that are too large for the available frames are recorded as NaN.
    bic_scores = []
    models = {}

    for n in N_COMPONENTS_LIST:
        if len(features) < n * 2:
            bic_scores.append(np.nan)
            continue

        gmm = GaussianMixture(n_components=n, covariance_type='diag', max_iter=100, random_state=42)
        gmm.fit(features)

        score = gmm.bic(features)
        bic_scores.append(score)
        models[n] = gmm

    # 4. Select Best Model (lowest BIC among the sizes that were trained)
    scored = [(n, s) for n, s in zip(N_COMPONENTS_LIST, bic_scores) if not np.isnan(s)]
    if not scored:
        # Every candidate size was skipped; np.argmin would raise on an empty list.
        print(" -> No model could be trained for any component count. Skipping.")
        continue

    valid_n = [n for n, _ in scored]
    valid_scores = [s for _, s in scored]

    best_idx = int(np.argmin(valid_scores))
    best_n = valid_n[best_idx]
    best_model = models[best_n]

    # Store for plotting
    speaker_results[speaker] = {"n": valid_n, "bic": valid_scores, "best": best_n}

    # 5. Save
    save_path = os.path.join(MODELS_DIR, f"GMM_Speaker_{speaker}_best.joblib")
    joblib.dump(best_model, save_path)
    print(f" -> Saved Best Model (n={best_n}) to {save_path}")

## Results

In [None]:
# ==========================================
# 5. PLOT BIC SCORES
# ==========================================
# One BIC-vs-component-count curve per speaker, drawn on a shared log-scaled x axis.
fig, ax = plt.subplots(figsize=(10, 6))

for name, result in speaker_results.items():
    curve_label = f"{name} (Best: {result['best']})"
    ax.plot(result["n"], result["bic"], marker='o', label=curve_label)

ax.set_title("Speaker Model Complexity Study (BIC)")
ax.set_xlabel("Number of Gaussians")
ax.set_ylabel("BIC Score (Lower is Better)")

# Log axis, with ticks labelled by the actual candidate component counts.
ax.set_xscale('log')
ax.set_xticks(N_COMPONENTS_LIST)
ax.set_xticklabels(N_COMPONENTS_LIST)

ax.legend()
ax.grid(True, alpha=0.3)
plt.show()