# **INIT LIBRARY**

In [1]:
# =========================
# 0) Init Library (Baseline - Official)
# =========================

import os, sys, re, json, math, random, warnings
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Tuple

import numpy as np
import pandas as pd

# (opsional) progress bar
from tqdm.auto import tqdm

# (opsional) ML util dasar
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# Torch optional (kalau nanti SSL / deep regressor)
try:
    import torch
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    torch.manual_seed(SEED)
    if DEVICE == "cuda":
        torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
except Exception:
    torch = None
    DEVICE = "cpu"

print("Init OK")
print("SEED  :", SEED)
print("DEVICE:", DEVICE)
print("pandas:", pd.__version__)
print("numpy :", np.__version__)
print("sklearn:", __import__("sklearn").__version__)
if torch is not None:
    print("torch :", torch.__version__)


  from .autonotebook import tqdm as notebook_tqdm


Init OK
SEED  : 42
DEVICE: cpu
pandas: 2.2.1
numpy : 1.26.4
sklearn: 1.4.1.post1
torch : 2.7.0+cpu


# **SETUP PATH + LOAD METADATA (EXCLUDE VAD DROP)**

In [2]:
# =========================
# 1) Setup path + Load metadata (official) + Exclude VAD drop
# =========================

# --- auto-detect ROOT (cari folder yang punya preprocessing/meta_*.csv)
CWD = Path.cwd().resolve()
ROOT = None
PREP_DIR = None

for p in [CWD] + list(CWD.parents):
    # kemungkinan struktur: <root>/preprocessing/...
    if (p / "preprocessing").exists() and (p / "preprocessing" / "meta_train_official.csv").exists():
        ROOT = p
        PREP_DIR = p / "preprocessing"
        break
    # kemungkinan struktur: <root>/output/preprocessing/...
    if (p / "output" / "preprocessing").exists() and (p / "output" / "preprocessing" / "meta_train_official.csv").exists():
        ROOT = p
        PREP_DIR = p / "output" / "preprocessing"
        break

if ROOT is None or PREP_DIR is None:
    raise FileNotFoundError(
        "Gagal nemu folder preprocessing. Pastikan ada file preprocessing/meta_train_official.csv "
        "atau output/preprocessing/meta_train_official.csv di salah satu parent folder."
    )

print("ROOT    :", ROOT)
print("PREP_DIR:", PREP_DIR)

# --- load metadata official
train_path = PREP_DIR / "meta_train_official.csv"
val_path   = PREP_DIR / "meta_val_official.csv"
test_path  = PREP_DIR / "meta_test_official.csv"

df_train = pd.read_csv(train_path)
df_val   = pd.read_csv(val_path)
df_test  = pd.read_csv(test_path)

print("\n[Loaded official metadata]")
print("Train:", df_train.shape, "| Val:", df_val.shape, "| Test:", df_test.shape)


# load vad drop 
VAD_DROP = ROOT / "output" / "preprocessing" / "vad" / "vad_drop.csv"

def exclude_vad_drop(df: pd.DataFrame, vad_drop_path: Path) -> pd.DataFrame:
    drop_path = pd.read_csv(vad_drop_path)
    initial_len = len(df)
    df = df[~df["clip_id"].isin(drop_path["clip_id"].values)].reset_index(drop=True)
    final_len = len(df)
    print(f"Excluded {initial_len - final_len} samples due to VAD drop.")
    print(f"New shape: {df.shape}")
    print(f"Size before drop: {initial_len}, size after drop: {final_len}\n")
    return df

df_train = exclude_vad_drop(df_train, VAD_DROP)
df_val   = exclude_vad_drop(df_val, VAD_DROP)
df_test  = exclude_vad_drop(df_test, VAD_DROP)




ROOT    : E:\tugas-akhir-qiqi
PREP_DIR: E:\tugas-akhir-qiqi\output\preprocessing

[Loaded official metadata]
Train: (6000, 13) | Val: (2000, 13) | Test: (2000, 13)
Excluded 12 samples due to VAD drop.
New shape: (5988, 13)
Size before drop: 6000, size after drop: 5988

Excluded 6 samples due to VAD drop.
New shape: (1994, 13)
Size before drop: 2000, size after drop: 1994

Excluded 8 samples due to VAD drop.
New shape: (1992, 13)
Size before drop: 2000, size after drop: 1992



# **DOWNLOAD & SETUP BACKBONE**

In [3]:
# =========================
# 2) Setup & Download Backbones (cache ke ROOT/hf_cache, NO tokenizer)
# =========================

import os, sys, subprocess
from pathlib import Path

def _pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + pkgs)

# --- pastikan ROOT sudah ada dari cell sebelumnya
assert "ROOT" in globals(), "ROOT belum ada. Jalankan cell setup path dulu."

# --- set cache SEBELUM import transformers/hf hub (lebih konsisten)
HF_CACHE = Path(ROOT) / "hf_cache"
HF_CACHE.mkdir(parents=True, exist_ok=True)
(HF_CACHE / "hub").mkdir(parents=True, exist_ok=True)

os.environ["HF_HOME"] = str(HF_CACHE)
os.environ["HF_HUB_CACHE"] = str(HF_CACHE / "hub")
os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE)

print("HF_CACHE:", HF_CACHE)

# --- dependencies minimal
try:
    import transformers
except Exception:
    _pip_install(["transformers>=4.40.0", "accelerate", "huggingface_hub", "safetensors"])
    import transformers

# protobuf/sentencepiece (kadang dibutuhkan internal)
try:
    import google.protobuf  # type: ignore
except Exception:
    _pip_install(["protobuf", "sentencepiece"])

# audio IO
try:
    import torchaudio
except Exception:
    _pip_install(["torchaudio", "soundfile"])

import torch
from transformers import AutoFeatureExtractor, AutoModel

# --- model ids (SSL backbone, bukan ASR CTC)
BACKBONES = {
    "wav2vec2": "facebook/wav2vec2-base",
    "hubert"  : "facebook/hubert-base-ls960",
    "wavlm"   : "microsoft/wavlm-base-plus",
}

dtype  = torch.float16 if torch.cuda.is_available() else torch.float32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

feature_extractors = {}
models = {}

for key, model_id in BACKBONES.items():
    print(f"\nLoading {key} -> {model_id}")

    # PAKSA cache masuk ke HF_CACHE, meski sebelumnya ada cache default
    fe = AutoFeatureExtractor.from_pretrained(model_id, cache_dir=str(HF_CACHE))
    model = AutoModel.from_pretrained(model_id, cache_dir=str(HF_CACHE), torch_dtype=dtype)

    model.to(DEVICE)
    model.eval()
    for p in model.parameters():
        p.requires_grad = False  # frozen backbone (baseline)

    feature_extractors[key] = fe
    models[key] = model

    hidden = getattr(model.config, "hidden_size", None)
    layers = getattr(model.config, "num_hidden_layers", None)
    print(f"  device={DEVICE} dtype={dtype}")
    print(f"  hidden_size={hidden} num_layers={layers}")

print("\nDone. Keys:", list(models.keys()))
print("Cache hub dir exists:", (HF_CACHE / "hub").exists())


HF_CACHE: E:\tugas-akhir-qiqi\hf_cache

Loading wav2vec2 -> facebook/wav2vec2-base
  device=cpu dtype=torch.float32
  hidden_size=768 num_layers=12

Loading hubert -> facebook/hubert-base-ls960
  device=cpu dtype=torch.float32
  hidden_size=768 num_layers=12

Loading wavlm -> microsoft/wavlm-base-plus
  device=cpu dtype=torch.float32
  hidden_size=768 num_layers=12

Done. Keys: ['wav2vec2', 'hubert', 'wavlm']
Cache hub dir exists: True


In [4]:
import soundfile as sf


# folder audio hasil preprocessing kamu (sesuaikan kalau beda)
AUDIO_DIR = ROOT / "output" / "preprocessing" / "preprocessed_full"  
assert AUDIO_DIR.exists(), f"AUDIO_DIR tidak ditemukan: {AUDIO_DIR}"

def load_wav_16k(path: Path, target_sr=16000, max_sec=15.0):
    wav, sr = sf.read(str(path))
    if wav.ndim > 1:  # stereo -> mono
        wav = wav.mean(axis=1)
    # kalau sr != 16k, lebih aman kamu pastikan preprocessing memang sudah 16k
    if sr != target_sr:
        raise ValueError(f"SR bukan {target_sr}. Dapat {sr} untuk {path.name}. Pastikan preprocessing sudah 16k.")
    max_len = int(target_sr * max_sec)
    if len(wav) > max_len:
        wav = wav[:max_len]
    return wav.astype(np.float32)

@torch.no_grad()
def extract_embedding(backbone_key: str, wav_path: Path):
    fe = feature_extractors[backbone_key]
    model = models[backbone_key]

    wav = load_wav_16k(wav_path)
    inputs = fe(wav, sampling_rate=16000, return_tensors="pt", padding=True)

    input_values = inputs["input_values"].to(DEVICE)
    attn_mask = inputs.get("attention_mask", None)
    if attn_mask is not None:
        attn_mask = attn_mask.to(DEVICE)

    # forward
    out = model(input_values, attention_mask=attn_mask)
    hs = out.last_hidden_state  # (B, T', H)

    # pakai masked pooling hanya kalau mask-nya "frame-level" (panjangnya sama dengan T')
    if attn_mask is not None and attn_mask.ndim == 2 and attn_mask.shape[1] == hs.shape[1]:
        mask = attn_mask.unsqueeze(-1).type_as(hs)  # (B, T', 1)
        emb = (hs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)
    else:
        emb = hs.mean(dim=1)  # fallback aman untuk semua (termasuk WavLM)

    return emb.squeeze(0).detach().cpu().numpy()  # (H,)



# **SMOKE TEST 3 EXAMPLE**

In [None]:
# =========================
# 3) Smoke test: extract embedding (3 samples) - official train
# =========================

# ambil 3 sample dari train
sample_ids = df_train["clip_id"].astype(str).head(3).tolist()
print("Sample clip_id:", sample_ids)

for cid in sample_ids:
    wav_path = AUDIO_DIR / f"{cid}.wav"
    assert wav_path.exists(), f"File tidak ada: {wav_path}"
    vec = extract_embedding("hubert", wav_path)  # ganti: "wav2vec2" / "wavlm"
    print(cid, "-> embedding shape:", vec.shape, "| nan?", np.isnan(vec).any())


Sample clip_id: ['--Ymqszjv54.001', '--Ymqszjv54.003', '--Ymqszjv54.004']
--Ymqszjv54.001 -> embedding shape: (768,) | nan? False
--Ymqszjv54.003 -> embedding shape: (768,) | nan? False
--Ymqszjv54.004 -> embedding shape: (768,) | nan? False


In [9]:
for cid in sample_ids:
    wav_path = AUDIO_DIR / f"{cid}.wav"
    assert wav_path.exists(), f"File tidak ada: {wav_path}"
    vec = extract_embedding("wav2vec2", wav_path) 
    print(cid, "-> embedding shape:", vec.shape, "| nan?", np.isnan(vec).any())


--Ymqszjv54.001 -> embedding shape: (768,) | nan? False
--Ymqszjv54.003 -> embedding shape: (768,) | nan? False
--Ymqszjv54.004 -> embedding shape: (768,) | nan? False


In [10]:
for cid in sample_ids:
    wav_path = AUDIO_DIR / f"{cid}.wav"
    assert wav_path.exists(), f"File tidak ada: {wav_path}"
    vec = extract_embedding("wavlm", wav_path) 
    print(cid, "-> embedding shape:", vec.shape, "| nan?", np.isnan(vec).any())


--Ymqszjv54.001 -> embedding shape: (768,) | nan? False
--Ymqszjv54.003 -> embedding shape: (768,) | nan? False
--Ymqszjv54.004 -> embedding shape: (768,) | nan? False


# **RUN EMBEDDING HUBERT**

In [11]:
BACKBONE_KEY = "hubert"


AUDIO_DIR =  ROOT / "output" / "preprocessing" / "preprocessed_full"
if AUDIO_DIR is None:
    raise FileNotFoundError(f"Folder preprocessed_full tidak ditemukan")
print("AUDIO_DIR:", AUDIO_DIR)

# output cache
EMB_DIR = ROOT / "output" / "baseline_official" / "embeddings" / BACKBONE_KEY
EMB_DIR.mkdir(parents=True, exist_ok=True)
print("EMB_DIR:", EMB_DIR)

H = int(getattr(models[BACKBONE_KEY].config, "hidden_size", 768))
print("Hidden size:", H)

def _wav_path_from_clip_id(cid: str) -> Path:
    return AUDIO_DIR / f"{cid}.wav"

def extract_split_embeddings(df: pd.DataFrame, split_name: str):
    out_emb_path = EMB_DIR / f"{split_name}_emb.npy"
    out_id_path  = EMB_DIR / f"{split_name}_clip_id.csv"

    # kalau sudah ada cache, skip biar reproducible & hemat waktu
    if out_emb_path.exists() and out_id_path.exists():
        emb = np.load(out_emb_path, mmap_mode="r")
        ids = pd.read_csv(out_id_path)["clip_id"].astype(str).tolist()
        print(f"[SKIP] {split_name}: cache ditemukan -> {emb.shape}")
        return emb, ids

    ids = df["clip_id"].astype(str).tolist()
    N = len(ids)
    emb_mat = np.zeros((N, H), dtype=np.float32)

    missing = 0
    for i, cid in enumerate(tqdm(ids, desc=f"Extract {BACKBONE_KEY} | {split_name}", total=N)):
        wp = _wav_path_from_clip_id(cid)
        if not wp.exists():
            missing += 1
            continue
        vec = extract_embedding(BACKBONE_KEY, wp)  # pakai fungsi extract_embedding versi aman (ada fallback mean)
        emb_mat[i] = vec.astype(np.float32)

    np.save(out_emb_path, emb_mat)
    pd.DataFrame({"clip_id": ids}).to_csv(out_id_path, index=False)

    print(f"[DONE] {split_name}: saved {emb_mat.shape} | missing_audio={missing}")
    return emb_mat, ids

# run untuk official split (setelah VAD drop)
X_train, train_ids = extract_split_embeddings(df_train, "train")
X_val,   val_ids   = extract_split_embeddings(df_val,   "val")
X_test,  test_ids  = extract_split_embeddings(df_test,  "test")


AUDIO_DIR: E:\tugas-akhir-qiqi\output\preprocessing\preprocessed_full
EMB_DIR: E:\tugas-akhir-qiqi\output\baseline_official\embeddings\hubert
Hidden size: 768


Extract hubert | train:   0%|          | 0/5988 [00:00<?, ?it/s]

Extract hubert | train: 100%|██████████| 5988/5988 [4:01:51<00:00,  2.42s/it]  


[DONE] train: saved (5988, 768) | missing_audio=0


Extract hubert | val: 100%|██████████| 1994/1994 [1:18:19<00:00,  2.36s/it]


[DONE] val: saved (1994, 768) | missing_audio=0


Extract hubert | test: 100%|██████████| 1992/1992 [1:18:37<00:00,  2.37s/it]

[DONE] test: saved (1992, 768) | missing_audio=0





# **RUN EMBEDDING WAV2VEC2**

In [5]:
BACKBONE_KEY = "wav2vec2"


AUDIO_DIR =  ROOT / "output" / "preprocessing" / "preprocessed_full"
if AUDIO_DIR is None:
    raise FileNotFoundError(f"Folder preprocessed_full tidak ditemukan")
print("AUDIO_DIR:", AUDIO_DIR)

# output cache
EMB_DIR = ROOT / "output" / "baseline_official" / "embeddings" / BACKBONE_KEY
EMB_DIR.mkdir(parents=True, exist_ok=True)
print("EMB_DIR:", EMB_DIR)

H = int(getattr(models[BACKBONE_KEY].config, "hidden_size", 768))
print("Hidden size:", H)

def _wav_path_from_clip_id(cid: str) -> Path:
    return AUDIO_DIR / f"{cid}.wav"

def extract_split_embeddings(df: pd.DataFrame, split_name: str):
    out_emb_path = EMB_DIR / f"{split_name}_emb.npy"
    out_id_path  = EMB_DIR / f"{split_name}_clip_id.csv"

    # kalau sudah ada cache, skip biar reproducible & hemat waktu
    if out_emb_path.exists() and out_id_path.exists():
        emb = np.load(out_emb_path, mmap_mode="r")
        ids = pd.read_csv(out_id_path)["clip_id"].astype(str).tolist()
        print(f"[SKIP] {split_name}: cache ditemukan -> {emb.shape}")
        return emb, ids

    ids = df["clip_id"].astype(str).tolist()
    N = len(ids)
    emb_mat = np.zeros((N, H), dtype=np.float32)

    missing = 0
    for i, cid in enumerate(tqdm(ids, desc=f"Extract {BACKBONE_KEY} | {split_name}", total=N)):
        wp = _wav_path_from_clip_id(cid)
        if not wp.exists():
            missing += 1
            continue
        vec = extract_embedding(BACKBONE_KEY, wp)  # pakai fungsi extract_embedding versi aman (ada fallback mean)
        emb_mat[i] = vec.astype(np.float32)

    np.save(out_emb_path, emb_mat)
    pd.DataFrame({"clip_id": ids}).to_csv(out_id_path, index=False)

    print(f"[DONE] {split_name}: saved {emb_mat.shape} | missing_audio={missing}")
    return emb_mat, ids

# run untuk official split (setelah VAD drop)
X_train, train_ids = extract_split_embeddings(df_train, "train")
X_val,   val_ids   = extract_split_embeddings(df_val,   "val")
X_test,  test_ids  = extract_split_embeddings(df_test,  "test")


AUDIO_DIR: E:\tugas-akhir-qiqi\output\preprocessing\preprocessed_full
EMB_DIR: E:\tugas-akhir-qiqi\output\baseline_official\embeddings\wav2vec2
Hidden size: 768


Extract wav2vec2 | train: 100%|██████████| 5988/5988 [3:27:10<00:00,  2.08s/it]  


[DONE] train: saved (5988, 768) | missing_audio=0


Extract wav2vec2 | val: 100%|██████████| 1994/1994 [1:14:29<00:00,  2.24s/it]


[DONE] val: saved (1994, 768) | missing_audio=0


Extract wav2vec2 | test: 100%|██████████| 1992/1992 [1:12:20<00:00,  2.18s/it]

[DONE] test: saved (1992, 768) | missing_audio=0





# **RUN EMBEDDING WAVLM**

In [6]:
BACKBONE_KEY = "wavlm"


AUDIO_DIR =  ROOT / "output" / "preprocessing" / "preprocessed_full"
if AUDIO_DIR is None:
    raise FileNotFoundError(f"Folder preprocessed_full tidak ditemukan")
print("AUDIO_DIR:", AUDIO_DIR)

# output cache
EMB_DIR = ROOT / "output" / "baseline_official" / "embeddings" / BACKBONE_KEY
EMB_DIR.mkdir(parents=True, exist_ok=True)
print("EMB_DIR:", EMB_DIR)

H = int(getattr(models[BACKBONE_KEY].config, "hidden_size", 768))
print("Hidden size:", H)

def _wav_path_from_clip_id(cid: str) -> Path:
    return AUDIO_DIR / f"{cid}.wav"

def extract_split_embeddings(df: pd.DataFrame, split_name: str):
    out_emb_path = EMB_DIR / f"{split_name}_emb.npy"
    out_id_path  = EMB_DIR / f"{split_name}_clip_id.csv"

    # kalau sudah ada cache, skip biar reproducible & hemat waktu
    if out_emb_path.exists() and out_id_path.exists():
        emb = np.load(out_emb_path, mmap_mode="r")
        ids = pd.read_csv(out_id_path)["clip_id"].astype(str).tolist()
        print(f"[SKIP] {split_name}: cache ditemukan -> {emb.shape}")
        return emb, ids

    ids = df["clip_id"].astype(str).tolist()
    N = len(ids)
    emb_mat = np.zeros((N, H), dtype=np.float32)

    missing = 0
    for i, cid in enumerate(tqdm(ids, desc=f"Extract {BACKBONE_KEY} | {split_name}", total=N)):
        wp = _wav_path_from_clip_id(cid)
        if not wp.exists():
            missing += 1
            continue
        vec = extract_embedding(BACKBONE_KEY, wp)  # pakai fungsi extract_embedding versi aman (ada fallback mean)
        emb_mat[i] = vec.astype(np.float32)

    np.save(out_emb_path, emb_mat)
    pd.DataFrame({"clip_id": ids}).to_csv(out_id_path, index=False)

    print(f"[DONE] {split_name}: saved {emb_mat.shape} | missing_audio={missing}")
    return emb_mat, ids

# run untuk official split (setelah VAD drop)
X_train, train_ids = extract_split_embeddings(df_train, "train")
X_val,   val_ids   = extract_split_embeddings(df_val,   "val")
X_test,  test_ids  = extract_split_embeddings(df_test,  "test")


AUDIO_DIR: E:\tugas-akhir-qiqi\output\preprocessing\preprocessed_full
EMB_DIR: E:\tugas-akhir-qiqi\output\baseline_official\embeddings\wavlm
Hidden size: 768


Extract wavlm | train: 100%|██████████| 5988/5988 [3:39:24<00:00,  2.20s/it]  


[DONE] train: saved (5988, 768) | missing_audio=0


Extract wavlm | val: 100%|██████████| 1994/1994 [1:13:02<00:00,  2.20s/it]


[DONE] val: saved (1994, 768) | missing_audio=0


Extract wavlm | test: 100%|██████████| 1992/1992 [1:12:22<00:00,  2.18s/it]

[DONE] test: saved (1992, 768) | missing_audio=0





# **TRAIN DAN SAVE OUTPUT HUBERT**

In [5]:
# =========================
# 5) Train Ridge Regressor (baseline) + Eval (official) for 1 backbone
# =========================

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

BACKBONE_KEY = "hubert"  # ganti: "wav2vec2" / "wavlm"

EMB_DIR = ROOT / "output" / "baseline_official" / "embeddings" / BACKBONE_KEY
assert EMB_DIR.exists(), f"EMB_DIR tidak ada: {EMB_DIR}"

# --- load embeddings + ids
def load_split(split):
    X = np.load(EMB_DIR / f"{split}_emb.npy")
    ids = pd.read_csv(EMB_DIR / f"{split}_clip_id.csv")["clip_id"].astype(str).tolist()
    return X, ids

X_train, train_ids = load_split("train")
X_val,   val_ids   = load_split("val")
X_test,  test_ids  = load_split("test")

print("X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)

# --- auto-detect label columns (ambil 5 kolom float yang bukan id/path/group)
def detect_label_cols(df):
    exclude = set([c for c in df.columns if any(k in c.lower() for k in ["clip", "id", "path", "file", "group", "split"])])
    num_cols = [c for c in df.columns if (c not in exclude) and pd.api.types.is_numeric_dtype(df[c])]
    # heuristik: label big five biasanya 5 kolom numerik
    if len(num_cols) < 5:
        raise ValueError(f"Kolom numerik kandidat label kurang dari 5: {num_cols}")
    # ambil 5 kolom numerik pertama (atau kamu bisa urutkan sesuai nama)
    return num_cols[:5]

label_cols = detect_label_cols(df_train)
print("Label cols:", label_cols)

y_train = df_train[label_cols].to_numpy(dtype=np.float32)
y_val   = df_val[label_cols].to_numpy(dtype=np.float32)
y_test  = df_test[label_cols].to_numpy(dtype=np.float32)

# --- scale embeddings (fit hanya di train)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# --- ridge multi-output
ridge = Ridge(alpha=1.0, random_state=42)
model = MultiOutputRegressor(ridge)

model.fit(X_train_s, y_train)

# --- predict
pred_val  = model.predict(X_val_s)
pred_test = model.predict(X_test_s)

def metrics(y_true, y_pred, name=""):
    mae  = mean_absolute_error(y_true, y_pred, multioutput="raw_values")  # per trait
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    r2   = r2_score(y_true, y_pred, multioutput="raw_values")

    acc = 1.0 - mae                       # Acc_j = 1 - MAE_j
    mean_acc = acc.mean()                 # mean accuracy (rata-rata 5 trait)

    dfm = pd.DataFrame({    
        "trait": label_cols,
        "Acc(1-MAE)": acc,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
    })

    dfm.loc["mean"] = ["mean", mean_acc, mae.mean(), rmse.mean(), r2.mean()]

    print(f"\n== {name} ==")
    display(dfm)
    return dfm


m_val  = metrics(y_val,  pred_val,  f"{BACKBONE_KEY} | VAL")
m_test = metrics(y_test, pred_test, f"{BACKBONE_KEY} | TEST")

# --- save outputs
OUT_DIR = ROOT / "output" / "baseline_official" / "results" / BACKBONE_KEY
OUT_DIR.mkdir(parents=True, exist_ok=True)

# save preds
pd.DataFrame({"clip_id": val_ids, **{f"pred_{c}": pred_val[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_val.csv", index=False)
pd.DataFrame({"clip_id": test_ids, **{f"pred_{c}": pred_test[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_test.csv", index=False)

# save metrics
m_val.to_csv(OUT_DIR / "metrics_val.csv", index=False)
m_test.to_csv(OUT_DIR / "metrics_test.csv", index=False)

# save scaler + model (joblib)
import joblib
joblib.dump(scaler, OUT_DIR / "scaler.joblib")
joblib.dump(model, OUT_DIR / "ridge_multioutput.joblib")

print("\nSaved to:", OUT_DIR)


X_train: (5988, 768) X_val: (1994, 768) X_test: (1992, 768)
Label cols: ['Ethnicity', 'Gender', 'extraversion', 'neuroticism', 'agreeableness']

== hubert | VAL ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.774493,0.225507,0.327159,0.231725
1,Gender,0.878411,0.121589,0.170028,0.883594
2,extraversion,0.898941,0.101059,0.125548,0.278494
3,neuroticism,0.900084,0.099916,0.126647,0.281676
4,agreeableness,0.905943,0.094057,0.119798,0.11334
mean,mean,0.871574,0.128426,0.173836,0.357766



== hubert | TEST ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.773315,0.226685,0.328772,0.157689
1,Gender,0.875621,0.124379,0.176463,0.873906
2,extraversion,0.899344,0.100656,0.126845,0.280039
3,neuroticism,0.897512,0.102488,0.128737,0.289691
4,agreeableness,0.901104,0.098896,0.12466,0.118799
mean,mean,0.869379,0.130621,0.177095,0.344025



Saved to: E:\tugas-akhir-qiqi\output\baseline_official\results\hubert


# **TRAIN DAN SAVE OUTPUT WAV2VEC2**

In [6]:
# =========================
# 5) Train Ridge Regressor (baseline) + Eval (official) for 1 backbone
# =========================

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

BACKBONE_KEY = "wav2vec2"  # ganti: "wav2vec2" / "wavlm"

EMB_DIR = ROOT / "output" / "baseline_official" / "embeddings" / BACKBONE_KEY
assert EMB_DIR.exists(), f"EMB_DIR tidak ada: {EMB_DIR}"

# --- load embeddings + ids
def load_split(split):
    X = np.load(EMB_DIR / f"{split}_emb.npy")
    ids = pd.read_csv(EMB_DIR / f"{split}_clip_id.csv")["clip_id"].astype(str).tolist()
    return X, ids

X_train, train_ids = load_split("train")
X_val,   val_ids   = load_split("val")
X_test,  test_ids  = load_split("test")

print("X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)

# --- auto-detect label columns (ambil 5 kolom float yang bukan id/path/group)
def detect_label_cols(df):
    exclude = set([c for c in df.columns if any(k in c.lower() for k in ["clip", "id", "path", "file", "group", "split"])])
    num_cols = [c for c in df.columns if (c not in exclude) and pd.api.types.is_numeric_dtype(df[c])]
    # heuristik: label big five biasanya 5 kolom numerik
    if len(num_cols) < 5:
        raise ValueError(f"Kolom numerik kandidat label kurang dari 5: {num_cols}")
    # ambil 5 kolom numerik pertama (atau kamu bisa urutkan sesuai nama)
    return num_cols[:5]

label_cols = detect_label_cols(df_train)
print("Label cols:", label_cols)

y_train = df_train[label_cols].to_numpy(dtype=np.float32)
y_val   = df_val[label_cols].to_numpy(dtype=np.float32)
y_test  = df_test[label_cols].to_numpy(dtype=np.float32)

# --- scale embeddings (fit hanya di train)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# --- ridge multi-output
ridge = Ridge(alpha=1.0, random_state=42)
model = MultiOutputRegressor(ridge)

model.fit(X_train_s, y_train)

# --- predict
pred_val  = model.predict(X_val_s)
pred_test = model.predict(X_test_s)

def metrics(y_true, y_pred, name=""):
    mae  = mean_absolute_error(y_true, y_pred, multioutput="raw_values")  # per trait
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    r2   = r2_score(y_true, y_pred, multioutput="raw_values")

    acc = 1.0 - mae                       # Acc_j = 1 - MAE_j
    mean_acc = acc.mean()                 # mean accuracy (rata-rata 5 trait)

    dfm = pd.DataFrame({    
        "trait": label_cols,
        "Acc(1-MAE)": acc,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
    })

    dfm.loc["mean"] = ["mean", mean_acc, mae.mean(), rmse.mean(), r2.mean()]

    print(f"\n== {name} ==")
    display(dfm)
    return dfm


m_val  = metrics(y_val,  pred_val,  f"{BACKBONE_KEY} | VAL")
m_test = metrics(y_test, pred_test, f"{BACKBONE_KEY} | TEST")

# --- save outputs
OUT_DIR = ROOT / "output" / "baseline_official" / "results" / BACKBONE_KEY
OUT_DIR.mkdir(parents=True, exist_ok=True)

# save preds
pd.DataFrame({"clip_id": val_ids, **{f"pred_{c}": pred_val[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_val.csv", index=False)
pd.DataFrame({"clip_id": test_ids, **{f"pred_{c}": pred_test[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_test.csv", index=False)

# save metrics
m_val.to_csv(OUT_DIR / "metrics_val.csv", index=False)
m_test.to_csv(OUT_DIR / "metrics_test.csv", index=False)

# save scaler + model (joblib)
import joblib
joblib.dump(scaler, OUT_DIR / "scaler.joblib")
joblib.dump(model, OUT_DIR / "ridge_multioutput.joblib")

print("\nSaved to:", OUT_DIR)


X_train: (5988, 768) X_val: (1994, 768) X_test: (1992, 768)
Label cols: ['Ethnicity', 'Gender', 'extraversion', 'neuroticism', 'agreeableness']

== wav2vec2 | VAL ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.765696,0.234304,0.336173,0.188803
1,Gender,0.880884,0.119116,0.166747,0.888042
2,extraversion,0.897467,0.102533,0.127816,0.252186
3,neuroticism,0.89983,0.10017,0.125381,0.295957
4,agreeableness,0.905892,0.094108,0.118983,0.125357
mean,mean,0.869954,0.130046,0.17502,0.350069



== wav2vec2 | TEST ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.767426,0.232574,0.331625,0.143002
1,Gender,0.882016,0.117984,0.171382,0.881063
2,extraversion,0.897068,0.102932,0.128505,0.261076
3,neuroticism,0.897785,0.102215,0.129243,0.284096
4,agreeableness,0.899163,0.100837,0.127152,0.083218
mean,mean,0.868692,0.131308,0.177581,0.330491



Saved to: E:\tugas-akhir-qiqi\output\baseline_official\results\wav2vec2


# **TRAIN DAN SAVE OUTPUT WAVLM**

In [7]:
# =========================
# 5) Train Ridge Regressor (baseline) + Eval (official) for 1 backbone
# =========================

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

BACKBONE_KEY = "wavlm"  # ganti: "wav2vec2" / "wavlm"

EMB_DIR = ROOT / "output" / "baseline_official" / "embeddings" / BACKBONE_KEY
assert EMB_DIR.exists(), f"EMB_DIR tidak ada: {EMB_DIR}"

# --- load embeddings + ids
def load_split(split):
    X = np.load(EMB_DIR / f"{split}_emb.npy")
    ids = pd.read_csv(EMB_DIR / f"{split}_clip_id.csv")["clip_id"].astype(str).tolist()
    return X, ids

X_train, train_ids = load_split("train")
X_val,   val_ids   = load_split("val")
X_test,  test_ids  = load_split("test")

print("X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)

# --- auto-detect label columns (ambil 5 kolom float yang bukan id/path/group)
def detect_label_cols(df):
    exclude = set([c for c in df.columns if any(k in c.lower() for k in ["clip", "id", "path", "file", "group", "split"])])
    num_cols = [c for c in df.columns if (c not in exclude) and pd.api.types.is_numeric_dtype(df[c])]
    # heuristik: label big five biasanya 5 kolom numerik
    if len(num_cols) < 5:
        raise ValueError(f"Kolom numerik kandidat label kurang dari 5: {num_cols}")
    # ambil 5 kolom numerik pertama (atau kamu bisa urutkan sesuai nama)
    return num_cols[:5]

label_cols = detect_label_cols(df_train)
print("Label cols:", label_cols)

y_train = df_train[label_cols].to_numpy(dtype=np.float32)
y_val   = df_val[label_cols].to_numpy(dtype=np.float32)
y_test  = df_test[label_cols].to_numpy(dtype=np.float32)

# --- scale embeddings (fit hanya di train)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# --- ridge multi-output
ridge = Ridge(alpha=1.0, random_state=42)
model = MultiOutputRegressor(ridge)

model.fit(X_train_s, y_train)

# --- predict
pred_val  = model.predict(X_val_s)
pred_test = model.predict(X_test_s)

def metrics(y_true, y_pred, name=""):
    mae  = mean_absolute_error(y_true, y_pred, multioutput="raw_values")  # per trait
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    r2   = r2_score(y_true, y_pred, multioutput="raw_values")

    acc = 1.0 - mae                       # Acc_j = 1 - MAE_j
    mean_acc = acc.mean()                 # mean accuracy (rata-rata 5 trait)

    dfm = pd.DataFrame({    
        "trait": label_cols,
        "Acc(1-MAE)": acc,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
    })

    dfm.loc["mean"] = ["mean", mean_acc, mae.mean(), rmse.mean(), r2.mean()]

    print(f"\n== {name} ==")
    display(dfm)
    return dfm


m_val  = metrics(y_val,  pred_val,  f"{BACKBONE_KEY} | VAL")
m_test = metrics(y_test, pred_test, f"{BACKBONE_KEY} | TEST")

# --- save outputs
OUT_DIR = ROOT / "output" / "baseline_official" / "results" / BACKBONE_KEY
OUT_DIR.mkdir(parents=True, exist_ok=True)

# save preds
pd.DataFrame({"clip_id": val_ids, **{f"pred_{c}": pred_val[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_val.csv", index=False)
pd.DataFrame({"clip_id": test_ids, **{f"pred_{c}": pred_test[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_test.csv", index=False)

# save metrics
m_val.to_csv(OUT_DIR / "metrics_val.csv", index=False)
m_test.to_csv(OUT_DIR / "metrics_test.csv", index=False)

# save scaler + model (joblib)
import joblib
joblib.dump(scaler, OUT_DIR / "scaler.joblib")
joblib.dump(model, OUT_DIR / "ridge_multioutput.joblib")

print("\nSaved to:", OUT_DIR)


X_train: (5988, 768) X_val: (1994, 768) X_test: (1992, 768)
Label cols: ['Ethnicity', 'Gender', 'extraversion', 'neuroticism', 'agreeableness']

== wavlm | VAL ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.780407,0.219593,0.334,0.199257
1,Gender,0.86769,0.13231,0.177686,0.872871
2,extraversion,0.902618,0.097382,0.122322,0.315092
3,neuroticism,0.901067,0.098933,0.124438,0.306512
4,agreeableness,0.904602,0.095398,0.120902,0.096911
mean,mean,0.871277,0.128723,0.17587,0.358129



== wavlm | TEST ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.783526,0.216474,0.324744,0.178201
1,Gender,0.86637,0.13363,0.181377,0.866784
2,extraversion,0.900109,0.099891,0.124837,0.30266
3,neuroticism,0.899828,0.100172,0.126036,0.319194
4,agreeableness,0.901033,0.098968,0.124347,0.123212
mean,mean,0.870173,0.129827,0.176268,0.35801



Saved to: E:\tugas-akhir-qiqi\output\baseline_official\results\wavlm


# **PERBANDINGAN HASIL 3 EMBEDDING TRANSFORMER (FROZEN)**

In [8]:
# =========================
# 6) Compare results across backbones (VAL & TEST)
# =========================

import pandas as pd
from pathlib import Path

RESULT_ROOT = ROOT / "output" / "baseline_official" / "results"
BACKBONES = ["wav2vec2", "hubert", "wavlm"]

def load_metrics(backbone: str, split: str) -> pd.DataFrame:
    p = RESULT_ROOT / backbone / f"metrics_{split}.csv"
    if not p.exists():
        raise FileNotFoundError(f"File tidak ditemukan: {p}")
    df = pd.read_csv(p)
    # pastikan baris mean ada
    if "trait" in df.columns:
        df["trait"] = df["trait"].astype(str)
    return df

def summarize_mean(df: pd.DataFrame) -> pd.Series:
    # ambil baris mean (yang kamu buat)
    mean_row = df[df["trait"] == "mean"]
    if len(mean_row) == 0:
        # fallback: hitung mean manual dari 5 trait (selain 'mean' jika tidak ada)
        df2 = df[df["trait"] != "mean"].copy()
        return pd.Series({
            "Acc(1-MAE)": df2["Acc(1-MAE)"].mean() if "Acc(1-MAE)" in df2.columns else None,
            "MAE": df2["MAE"].mean(),
            "RMSE": df2["RMSE"].mean(),
            "R2": df2["R2"].mean(),
        })
    mean_row = mean_row.iloc[0]
    return pd.Series({
        "Acc(1-MAE)": mean_row.get("Acc(1-MAE)", None),
        "MAE": mean_row.get("MAE", None),
        "RMSE": mean_row.get("RMSE", None),
        "R2": mean_row.get("R2", None),
    })

rows_val, rows_test = [], []

for b in BACKBONES:
    dfv = load_metrics(b, "val")
    dft = load_metrics(b, "test")

    s_val = summarize_mean(dfv)
    s_test = summarize_mean(dft)

    rows_val.append({"backbone": b, **s_val.to_dict()})
    rows_test.append({"backbone": b, **s_test.to_dict()})

cmp_val = pd.DataFrame(rows_val)
cmp_test = pd.DataFrame(rows_test)

# ranking (lebih tinggi lebih baik untuk Acc & R2, lebih rendah lebih baik untuk MAE & RMSE)
def add_ranks(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    if "Acc(1-MAE)" in out.columns and out["Acc(1-MAE)"].notna().any():
        out["rank_Acc"] = out["Acc(1-MAE)"].rank(ascending=False, method="min")
    out["rank_MAE"] = out["MAE"].rank(ascending=True, method="min")
    out["rank_RMSE"] = out["RMSE"].rank(ascending=True, method="min")
    out["rank_R2"] = out["R2"].rank(ascending=False, method="min")
    # rank total sederhana (abaikan rank_Acc kalau kolomnya ga ada)
    rank_cols = [c for c in ["rank_Acc", "rank_MAE", "rank_RMSE", "rank_R2"] if c in out.columns]
    out["rank_total"] = out[rank_cols].mean(axis=1)
    return out.sort_values("rank_total")

cmp_val_ranked = add_ranks(cmp_val)
cmp_test_ranked = add_ranks(cmp_test)

print("=== MEAN METRICS (VAL) ===")
display(cmp_val_ranked)

print("\n=== MEAN METRICS (TEST) ===")
display(cmp_test_ranked)

# Optional: detail per-trait untuk split tertentu (misal VAL)
def per_trait_table(split="val", metric_col="MAE"):
    tables = []
    for b in BACKBONES:
        df = load_metrics(b, split)
        df = df[df["trait"] != "mean"][["trait", metric_col]].copy()
        df = df.rename(columns={metric_col: b})
        tables.append(df.set_index("trait"))
    return pd.concat(tables, axis=1)

print("\n=== Per-trait MAE (VAL) ===")
display(per_trait_table("val", "MAE"))

print("\n=== Per-trait R2 (VAL) ===")
display(per_trait_table("val", "R2"))


=== MEAN METRICS (VAL) ===


Unnamed: 0,backbone,Acc(1-MAE),MAE,RMSE,R2,rank_Acc,rank_MAE,rank_RMSE,rank_R2,rank_total
1,hubert,0.871574,0.128426,0.173836,0.357766,1.0,1.0,1.0,2.0,1.25
2,wavlm,0.871277,0.128723,0.17587,0.358129,2.0,2.0,3.0,1.0,2.0
0,wav2vec2,0.869954,0.130046,0.17502,0.350069,3.0,3.0,2.0,3.0,2.75



=== MEAN METRICS (TEST) ===


Unnamed: 0,backbone,Acc(1-MAE),MAE,RMSE,R2,rank_Acc,rank_MAE,rank_RMSE,rank_R2,rank_total
2,wavlm,0.870173,0.129827,0.176268,0.35801,1.0,1.0,1.0,1.0,1.0
1,hubert,0.869379,0.130621,0.177095,0.344025,2.0,2.0,2.0,2.0,2.0
0,wav2vec2,0.868692,0.131308,0.177581,0.330491,3.0,3.0,3.0,3.0,3.0



=== Per-trait MAE (VAL) ===


Unnamed: 0_level_0,wav2vec2,hubert,wavlm
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ethnicity,0.234304,0.225507,0.219593
Gender,0.119116,0.121589,0.13231
extraversion,0.102533,0.101059,0.097382
neuroticism,0.10017,0.099916,0.098933
agreeableness,0.094108,0.094057,0.095398



=== Per-trait R2 (VAL) ===


Unnamed: 0_level_0,wav2vec2,hubert,wavlm
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ethnicity,0.188803,0.231725,0.199257
Gender,0.888042,0.883594,0.872871
extraversion,0.252186,0.278494,0.315092
neuroticism,0.295957,0.281676,0.306512
agreeableness,0.125357,0.11334,0.096911


# **SET UP EKSTRAKSI FITUR eGeMAPS**

In [9]:
# =========================
# eGeMAPS Feature Extraction (Official) + Cache/Resume
# =========================

import os, sys, subprocess
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import opensmile
import soundfile as sf

assert "ROOT" in globals(), "ROOT belum ada"
assert "df_train" in globals() and "df_val" in globals() and "df_test" in globals(), "df_train/df_val/df_test belum ada"


# --- audio dir (samain dengan yang kamu pakai buat embedding)
AUDIO_DIR = ROOT / "output" / "preprocessing" / "preprocessed_full"
if not AUDIO_DIR.exists():
    raise FileNotFoundError(f"Folder audio tidak ditemukan: {AUDIO_DIR}")
print("AUDIO_DIR:", AUDIO_DIR)

# --- output dir eGeMAPS
EG_DIR = ROOT / "output" / "baseline_official" / "egemaps"
EG_DIR.mkdir(parents=True, exist_ok=True)
print("EG_DIR:", EG_DIR)

# --- init extractor eGeMAPS v02 (functionals)
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

def _wav_path(cid: str) -> Path:
    return AUDIO_DIR / f"{cid}.wav"

def extract_egemaps_split(df: pd.DataFrame, split_name: str, save_every: int = 200):
    """
    Extract eGeMAPS functionals per file.
    Save to CSV with resume support.
    """
    out_path = EG_DIR / f"{split_name}_egemaps.csv"

    # resume: kalau file sudah ada, lanjut dari clip_id yang belum
    done_ids = set()
    if out_path.exists():
        old = pd.read_csv(out_path)
        if "clip_id" in old.columns:
            done_ids = set(old["clip_id"].astype(str).tolist())
        print(f"[RESUME] {split_name}: found {len(done_ids)} already extracted in {out_path.name}")

    ids_all = df["clip_id"].astype(str).tolist()
    ids_todo = [cid for cid in ids_all if cid not in done_ids]
    print(f"[TODO] {split_name}: {len(ids_todo)} / {len(ids_all)} remaining")

    rows = []
    missing = 0

    def _flush(rows_buffer):
        if not rows_buffer:
            return
        new_df = pd.concat(rows_buffer, axis=0, ignore_index=True)
        if out_path.exists():
            new_df.to_csv(out_path, mode="a", header=False, index=False)
        else:
            new_df.to_csv(out_path, index=False)
        rows_buffer.clear()

    for i, cid in enumerate(tqdm(ids_todo, desc=f"eGeMAPS | {split_name}", total=len(ids_todo))):
        wp = _wav_path(cid)
        if not wp.exists():
            missing += 1
            continue

        # process 1 file -> dataframe 1 row (functionals)
        feat = smile.process_file(str(wp))
        feat = feat.reset_index(drop=True)
        feat.insert(0, "clip_id", cid)  # taruh clip_id di depan

        rows.append(feat)

        # save bertahap biar aman kalau notebook ke-stop
        if (i + 1) % save_every == 0:
            _flush(rows)

    _flush(rows)
    print(f"[DONE] {split_name}: saved -> {out_path} | missing_audio={missing}")

    # load final untuk dipakai langsung
    df_feat = pd.read_csv(out_path)
    print(f"[LOAD] {split_name}: {df_feat.shape}")
    return df_feat

eg_train = extract_egemaps_split(df_train, "train")
eg_val   = extract_egemaps_split(df_val, "val")
eg_test  = extract_egemaps_split(df_test, "test")

print("\nColumns example:", list(eg_train.columns)[:10])


AUDIO_DIR: E:\tugas-akhir-qiqi\output\preprocessing\preprocessed_full
EG_DIR: E:\tugas-akhir-qiqi\output\baseline_official\egemaps
[TODO] train: 5988 / 5988 remaining


eGeMAPS | train: 100%|██████████| 5988/5988 [45:02<00:00,  2.22it/s] 


[DONE] train: saved -> E:\tugas-akhir-qiqi\output\baseline_official\egemaps\train_egemaps.csv | missing_audio=0
[LOAD] train: (5988, 89)
[TODO] val: 1994 / 1994 remaining


eGeMAPS | val: 100%|██████████| 1994/1994 [15:04<00:00,  2.20it/s]


[DONE] val: saved -> E:\tugas-akhir-qiqi\output\baseline_official\egemaps\val_egemaps.csv | missing_audio=0
[LOAD] val: (1994, 89)
[TODO] test: 1992 / 1992 remaining


eGeMAPS | test: 100%|██████████| 1992/1992 [19:29<00:00,  1.70it/s]

[DONE] test: saved -> E:\tugas-akhir-qiqi\output\baseline_official\egemaps\test_egemaps.csv | missing_audio=0
[LOAD] test: (1992, 89)

Columns example: ['clip_id', 'F0semitoneFrom27.5Hz_sma3nz_amean', 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm', 'F0semitoneFrom27.5Hz_sma3nz_percentile20.0', 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0', 'F0semitoneFrom27.5Hz_sma3nz_percentile80.0', 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2', 'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope', 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope', 'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope']





In [10]:
# =========================
# eGeMAPS -> StandardScaler + Ridge (baseline) + Eval + Save
# =========================

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

EG_DIR = ROOT / "output" / "baseline_official" / "egemaps"
assert EG_DIR.exists(), f"EG_DIR tidak ada: {EG_DIR}"

# --- load eGeMAPS features (kalau belum ada di memory)
eg_train = pd.read_csv(EG_DIR / "train_egemaps.csv")
eg_val   = pd.read_csv(EG_DIR / "val_egemaps.csv")
eg_test  = pd.read_csv(EG_DIR / "test_egemaps.csv")

print("eGeMAPS train/val/test:", eg_train.shape, eg_val.shape, eg_test.shape)

# --- pastikan urutan sama dengan df_train/df_val/df_test (berdasarkan clip_id)
# (ini penting kalau ada resume/append)
eg_train = eg_train.set_index("clip_id").loc[df_train["clip_id"].astype(str)].reset_index()
eg_val   = eg_val.set_index("clip_id").loc[df_val["clip_id"].astype(str)].reset_index()
eg_test  = eg_test.set_index("clip_id").loc[df_test["clip_id"].astype(str)].reset_index()

# --- X (fitur) dan ids
train_ids = eg_train["clip_id"].astype(str).tolist()
val_ids   = eg_val["clip_id"].astype(str).tolist()
test_ids  = eg_test["clip_id"].astype(str).tolist()

X_train = eg_train.drop(columns=["clip_id"]).to_numpy(dtype=np.float32)
X_val   = eg_val.drop(columns=["clip_id"]).to_numpy(dtype=np.float32)
X_test  = eg_test.drop(columns=["clip_id"]).to_numpy(dtype=np.float32)

print("X shapes:", X_train.shape, X_val.shape, X_test.shape)

# --- detect label cols (5 trait) dari metadata
def detect_label_cols(df):
    exclude = set([c for c in df.columns if any(k in c.lower() for k in ["clip", "id", "path", "file", "group", "split"])])
    num_cols = [c for c in df.columns if (c not in exclude) and pd.api.types.is_numeric_dtype(df[c])]
    if len(num_cols) < 5:
        raise ValueError(f"Kolom numerik kandidat label kurang dari 5: {num_cols}")
    return num_cols[:5]

label_cols = detect_label_cols(df_train)
print("Label cols:", label_cols)

y_train = df_train[label_cols].to_numpy(dtype=np.float32)
y_val   = df_val[label_cols].to_numpy(dtype=np.float32)
y_test  = df_test[label_cols].to_numpy(dtype=np.float32)

# --- scale (fit hanya di train)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# --- regressor
ridge = Ridge(alpha=1.0, random_state=42)
model = MultiOutputRegressor(ridge)
model.fit(X_train_s, y_train)

pred_val  = model.predict(X_val_s)
pred_test = model.predict(X_test_s)

# --- metrics (sesuai definisi Acc = 1 - MAE)
def metrics(y_true, y_pred, name=""):
    mae  = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    r2   = r2_score(y_true, y_pred, multioutput="raw_values")
    acc  = 1.0 - mae

    dfm = pd.DataFrame({
        "trait": label_cols,
        "Acc(1-MAE)": acc,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
    })
    dfm.loc["mean"] = ["mean", acc.mean(), mae.mean(), rmse.mean(), r2.mean()]
    print(f"\n== {name} ==")
    display(dfm)
    return dfm

m_val  = metrics(y_val,  pred_val,  "eGeMAPS | VAL")
m_test = metrics(y_test, pred_test, "eGeMAPS | TEST")

# --- save outputs
OUT_DIR = ROOT / "output" / "baseline_official" / "results" / "egemaps"
OUT_DIR.mkdir(parents=True, exist_ok=True)

pd.DataFrame({"clip_id": val_ids, **{f"pred_{c}": pred_val[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_val.csv", index=False)
pd.DataFrame({"clip_id": test_ids, **{f"pred_{c}": pred_test[:,i] for i,c in enumerate(label_cols)}})\
  .to_csv(OUT_DIR / "pred_test.csv", index=False)

m_val.to_csv(OUT_DIR / "metrics_val.csv", index=False)
m_test.to_csv(OUT_DIR / "metrics_test.csv", index=False)

joblib.dump(scaler, OUT_DIR / "scaler.joblib")
joblib.dump(model, OUT_DIR / "ridge_multioutput.joblib")

print("\nSaved to:", OUT_DIR)


eGeMAPS train/val/test: (5988, 89) (1994, 89) (1992, 89)
X shapes: (5988, 88) (1994, 88) (1992, 88)
Label cols: ['Ethnicity', 'Gender', 'extraversion', 'neuroticism', 'agreeableness']

== eGeMAPS | VAL ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.794028,0.205972,0.370875,0.012684
1,Gender,0.819662,0.180338,0.237674,0.772542
2,extraversion,0.89508,0.10492,0.131813,0.204679
3,neuroticism,0.894984,0.105016,0.132619,0.212329
4,agreeableness,0.905677,0.094323,0.120253,0.106579
mean,mean,0.861886,0.138114,0.198647,0.261763



== eGeMAPS | TEST ==


Unnamed: 0,trait,Acc(1-MAE),MAE,RMSE,R2
0,Ethnicity,0.799931,0.200069,0.374632,-0.093692
1,Gender,0.816548,0.183452,0.246369,0.75421
2,extraversion,0.892871,0.107129,0.13658,0.165297
3,neuroticism,0.893206,0.106794,0.13416,0.22859
4,agreeableness,0.900899,0.099101,0.124908,0.115289
mean,mean,0.860691,0.139309,0.20333,0.233939



Saved to: E:\tugas-akhir-qiqi\output\baseline_official\results\egemaps


In [None]:
import pandas as pd
from pathlib import Path

RESULT_ROOT = ROOT / "output" / "baseline_official" / "results"

METHODS = ["wav2vec2", "hubert", "wavlm", "egemaps"]  # <- tambah egemaps

def load_metrics(method: str, split: str) -> pd.DataFrame:
    p = RESULT_ROOT / method / f"metrics_{split}.csv"
    if not p.exists():
        raise FileNotFoundError(f"File tidak ditemukan: {p}")
    df = pd.read_csv(p)
    df["trait"] = df["trait"].astype(str)
    return df

def summarize_mean(df: pd.DataFrame) -> dict:
    # ambil baris mean; fallback hitung manual kalau ga ada
    mean_row = df[df["trait"] == "mean"]
    if len(mean_row) > 0:
        r = mean_row.iloc[0]
        return {
            "Acc(1-MAE)": r.get("Acc(1-MAE)", None),
            "MAE": r.get("MAE", None),
            "RMSE": r.get("RMSE", None),
            "R2": r.get("R2", None),
        }
    df2 = df[df["trait"] != "mean"].copy()
    return {
        "Acc(1-MAE)": df2["Acc(1-MAE)"].mean() if "Acc(1-MAE)" in df2.columns else None,
        "MAE": df2["MAE"].mean(),
        "RMSE": df2["RMSE"].mean(),
        "R2": df2["R2"].mean(),
    }

def add_ranks(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    if "Acc(1-MAE)" in out.columns and out["Acc(1-MAE)"].notna().any():
        out["rank_Acc"] = out["Acc(1-MAE)"].rank(ascending=False, method="min")
    out["rank_MAE"] = out["MAE"].rank(ascending=True, method="min")
    out["rank_RMSE"] = out["RMSE"].rank(ascending=True, method="min")
    out["rank_R2"] = out["R2"].rank(ascending=False, method="min")

    rank_cols = [c for c in ["rank_Acc", "rank_MAE", "rank_RMSE", "rank_R2"] if c in out.columns]
    out["rank_total"] = out[rank_cols].mean(axis=1)
    return out.sort_values("rank_total")

# --- build mean tables
rows_val, rows_test = [], []
for m in METHODS:
    dfv = load_metrics(m, "val")
    dft = load_metrics(m, "test")

    rows_val.append({"method": m, **summarize_mean(dfv)})
    rows_test.append({"method": m, **summarize_mean(dft)})

cmp_val = add_ranks(pd.DataFrame(rows_val))
cmp_test = add_ranks(pd.DataFrame(rows_test))

print("=== MEAN METRICS (VAL) ===")
display(cmp_val)

print("\n=== MEAN METRICS (TEST) ===")
display(cmp_test)

# --- optional: per-trait table (VAL) for MAE/Acc/R2
def per_trait_table(split="val", metric_col="MAE"):
    tables = []
    for m in METHODS:
        df = load_metrics(m, split)
        df = df[df["trait"] != "mean"][["trait", metric_col]].copy()
        df = df.rename(columns={metric_col: m})
        tables.append(df.set_index("trait"))
    return pd.concat(tables, axis=1)

print("\n=== Per-trait Acc(1-MAE) (VAL) ===")
display(per_trait_table("val", "Acc(1-MAE)"))

print("\n=== Per-trait MAE (VAL) ===")
display(per_trait_table("val", "MAE"))

print("\n=== Per-trait R2 (VAL) ===")
display(per_trait_table("val", "R2"))


=== MEAN METRICS (VAL) ===


Unnamed: 0,method,Acc(1-MAE),MAE,RMSE,R2,rank_Acc,rank_MAE,rank_RMSE,rank_R2,rank_total
1,hubert,0.871574,0.128426,0.173836,0.357766,1.0,1.0,1.0,2.0,1.25
2,wavlm,0.871277,0.128723,0.17587,0.358129,2.0,2.0,3.0,1.0,2.0
0,wav2vec2,0.869954,0.130046,0.17502,0.350069,3.0,3.0,2.0,3.0,2.75
3,egemaps,0.861886,0.138114,0.198647,0.261763,4.0,4.0,4.0,4.0,4.0



=== MEAN METRICS (TEST) ===


Unnamed: 0,method,Acc(1-MAE),MAE,RMSE,R2,rank_Acc,rank_MAE,rank_RMSE,rank_R2,rank_total
2,wavlm,0.870173,0.129827,0.176268,0.35801,1.0,1.0,1.0,1.0,1.0
1,hubert,0.869379,0.130621,0.177095,0.344025,2.0,2.0,2.0,2.0,2.0
0,wav2vec2,0.868692,0.131308,0.177581,0.330491,3.0,3.0,3.0,3.0,3.0
3,egemaps,0.860691,0.139309,0.20333,0.233939,4.0,4.0,4.0,4.0,4.0



=== Per-trait Acc(1-MAE) (VAL) ===


Unnamed: 0_level_0,wav2vec2,hubert,wavlm,egemaps
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ethnicity,0.765696,0.774493,0.780407,0.794028
Gender,0.880884,0.878411,0.86769,0.819662
extraversion,0.897467,0.898941,0.902618,0.89508
neuroticism,0.89983,0.900084,0.901067,0.894984
agreeableness,0.905892,0.905943,0.904602,0.905677



=== Per-trait MAE (VAL) ===


Unnamed: 0_level_0,wav2vec2,hubert,wavlm,egemaps
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ethnicity,0.234304,0.225507,0.219593,0.205972
Gender,0.119116,0.121589,0.13231,0.180338
extraversion,0.102533,0.101059,0.097382,0.10492
neuroticism,0.10017,0.099916,0.098933,0.105016
agreeableness,0.094108,0.094057,0.095398,0.094323



=== Per-trait R2 (VAL) ===


Unnamed: 0_level_0,wav2vec2,hubert,wavlm,egemaps
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ethnicity,0.188803,0.231725,0.199257,0.012684
Gender,0.888042,0.883594,0.872871,0.772542
extraversion,0.252186,0.278494,0.315092,0.204679
neuroticism,0.295957,0.281676,0.306512,0.212329
agreeableness,0.125357,0.11334,0.096911,0.106579
