In [4]:
# === INIT: Strict Split (load + standardize) ===
from pathlib import Path
import pandas as pd
import numpy as np

SEED = 42
rng = np.random.default_rng(SEED)

def pick_existing(paths):
    for p in paths:
        if p and p.exists():
            return p
    return None

def group_id_from_clip_id(clip_id: str) -> str:
    return str(clip_id).split(".")[0]  # contoh: "--Ymqszjv54.001" -> "--Ymqszjv54"

CWD = Path.cwd()
ROOT = CWD.parent

# Kandidat lokasi output preprocessing (biar aman kalau dulu pakai relatif vs ROOT)
PREP_DIR = ROOT / "output" / "preprocessing"

if PREP_DIR is None:
    raise FileNotFoundError("Folder output preprocessing tidak ketemu. Cek apakah 'output/preprocessing' sudah ada.")

# Manifest trim/pad (anggap ini 'manifest.csv' di diagram kamu)
MANIFEST_PATH = pick_existing([
    PREP_DIR / "trim_pad_manifest.csv",
    PREP_DIR / "manifest.csv",
])

if MANIFEST_PATH is None:
    raise FileNotFoundError(f"Manifest tidak ketemu di {PREP_DIR}. Cari 'trim_pad_manifest.csv' atau 'manifest.csv'.")

# Meta master (gabungan label + demografi + group_id)
META_PATH = PREP_DIR / "meta_master.csv"
if not META_PATH.exists():
    raise FileNotFoundError(f"meta_master.csv tidak ketemu: {META_PATH}")

# VAD (opsional tapi biasanya kepakai buat N_clean)
VAD_REPORT_PATH = pick_existing([PREP_DIR / "vad" / "vad_report.csv"])
VAD_DROP_PATH   = pick_existing([PREP_DIR / "vad" / "vad_drop.csv"])

# Output strict split
STRICT_DIR = ROOT / "output" / "split_strict"
STRICT_DIR.mkdir(parents=True, exist_ok=True)

print("ROOT     :", ROOT)
print("PREP_DIR :", PREP_DIR)
print("MANIFEST :", MANIFEST_PATH.name)
print("META     :", META_PATH.name)
print("VAD rep  :", VAD_REPORT_PATH.name if VAD_REPORT_PATH else "(none)")
print("VAD drop :", VAD_DROP_PATH.name if VAD_DROP_PATH else "(none)")
print("OUT split:", STRICT_DIR)

# --- Load ---
df_manifest = pd.read_csv(MANIFEST_PATH)
df_meta = pd.read_csv(META_PATH)

# Pastikan clip_id string
for df in [df_manifest, df_meta]:
    if "clip_id" in df.columns:
        df["clip_id"] = df["clip_id"].astype(str)

# Standarisasi kolom demografi biar enak dipakai di split
rename_map = {
    "Gender": "gender",
    "Ethnicity": "ethnicity",
    "AgeGroup": "age_group",
    "split_official": "split_official",
    "group_id": "group_id",
}
df_meta = df_meta.rename(columns={k: v for k, v in rename_map.items() if k in df_meta.columns})

# Kalau group_id belum ada, bikin dari clip_id
if "group_id" not in df_meta.columns:
    df_meta["group_id"] = df_meta["clip_id"].map(group_id_from_clip_id)

# Merge manifest + meta (pool utama untuk strict split)
df = df_manifest.merge(df_meta, on="clip_id", how="inner", validate="one_to_one")

# Tambahin VAD info kalau ada
if VAD_REPORT_PATH:
    df_vad = pd.read_csv(VAD_REPORT_PATH)
    df_vad["clip_id"] = df_vad["clip_id"].astype(str)
    df = df.merge(df_vad, on="clip_id", how="left")

vad_drop_ids = set()
if VAD_DROP_PATH:
    df_drop = pd.read_csv(VAD_DROP_PATH)
    if "clip_id" in df_drop.columns:
        vad_drop_ids = set(df_drop["clip_id"].astype(str).tolist())

df["is_vad_drop"] = df["clip_id"].isin(vad_drop_ids)

# Hitung avg_trait per sample (kalau kolom trait ada)
trait_cols = [c for c in ["extraversion","neuroticism","agreeableness","conscientiousness","openness"] if c in df.columns]
if trait_cols:
    df["avg_trait"] = df[trait_cols].mean(axis=1)
else:
    df["avg_trait"] = np.nan

# Pool clean (yang akan dipakai ukuran split, N_clean)
df_clean = df.loc[~df["is_vad_drop"]].reset_index(drop=True)

print("\n=== Summary ===")
print("Total merged        :", len(df))
print("Total clean (no VAD) :", len(df_clean))
print("Unique group_id all  :", df["group_id"].nunique() if "group_id" in df.columns else "N/A")
print("Unique group_id clean:", df_clean["group_id"].nunique() if "group_id" in df_clean.columns else "N/A")

# Object yang dipakai di step berikutnya:
# df        -> semua sample hasil merge
# df_clean  -> kandidat untuk strict split (mengikuti N_clean)


ROOT     : e:\tugas-akhir-qiqi
PREP_DIR : e:\tugas-akhir-qiqi\output\preprocessing
MANIFEST : trim_pad_manifest.csv
META     : meta_master.csv
VAD rep  : vad_report.csv
VAD drop : vad_drop.csv
OUT split: e:\tugas-akhir-qiqi\output\split_strict

=== Summary ===
Total merged        : 10000
Total clean (no VAD) : 9974
Unique group_id all  : 3060
Unique group_id clean: 3054


In [11]:
import pandas as pd

# pilih kolom penting yang realistis kepakai
base_keep = [
    "clip_id",
    "group_id",
    "audio_out",          # penting buat manifest output split
    "gender",
    "ethnicity",
    "age_group",
    "avg_trait",          # buat binning stratifikasi
    "extraversion",
    "neuroticism",
    "agreeableness",
    "conscientiousness",
    "openness",
    "split_official",     # opsional, buat compare sama official split
    "is_vad_drop",        # opsional (harusnya false semua di df_clean)
]

# handle kasus kolom split_official kamu kebaca split_official_x
if "split_official" not in df_clean.columns and "split_official_x" in df_clean.columns:
    df_clean = df_clean.rename(columns={"split_official_x": "split_official"})

# keep yang memang ada saja
keep_cols = [c for c in base_keep if c in df_clean.columns]

df_split = df_clean[keep_cols].copy()

print("df_clean cols :", len(df_clean.columns))
print("df_split cols :", len(df_split.columns))
print("df_split head cols:", df_split.columns.tolist())
print("rows:", len(df_split), "unique groups:", df_split["group_id"].nunique())


df_clean cols : 29
df_split cols : 14
df_split head cols: ['clip_id', 'group_id', 'audio_out', 'gender', 'ethnicity', 'age_group', 'avg_trait', 'extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness', 'split_official', 'is_vad_drop']
rows: 9974 unique groups: 3054


In [13]:
df_split.head()

Unnamed: 0,clip_id,group_id,audio_out,gender,ethnicity,age_group,avg_trait,extraversion,neuroticism,agreeableness,conscientiousness,openness,split_official,is_vad_drop
0,--Ymqszjv54.001,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.594761,0.551402,0.5,0.527473,0.650485,0.744444,train,False
1,--Ymqszjv54.003,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.455697,0.392523,0.427083,0.516484,0.475728,0.466667,train,False
2,--Ymqszjv54.004,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.4207,0.317757,0.322917,0.549451,0.368932,0.544444,train,False
3,--Ymqszjv54.005,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.325838,0.299065,0.291667,0.373626,0.320388,0.344444,train,False
4,-2qsCrkXdWs.001,-2qsCrkXdWs,output\preprocessing\preprocessed_full\-2qsCrk...,1,2,2,0.571627,0.476636,0.604167,0.593407,0.572816,0.611111,train,False


In [14]:
# === TEST: Coba berbagai definisi STRATA (buat milih kolom yang paling stabil) ===
import pandas as pd
import numpy as np

# ---- cek kolom yang ada ----
cols = set(df_split.columns)
print("df_split cols:", sorted(list(cols)))

required_min = {"clip_id", "group_id", "avg_trait"}
if not required_min.issubset(cols):
    raise ValueError(f"df_split harus punya minimal {sorted(list(required_min))}")

# ---- helper mode stabil ----
def safe_mode(s: pd.Series):
    s = s.dropna()
    if s.empty:
        return np.nan
    vc = s.value_counts()
    top = vc[vc == vc.max()].index.tolist()
    try:
        return sorted(top)[0]  # tie-break deterministik
    except Exception:
        return top[0]

def norm_cat(x):
    if pd.isna(x):
        return "UNK"
    x = str(x).strip()
    if x == "" or x.lower() in {"nan", "none"}:
        return "UNK"
    return x

# ---- bangun df_group minimal (1 baris per group_id) ----
agg = {"clip_id": "count", "avg_trait": "mean"}
if "gender" in cols:    agg["gender"] = safe_mode
if "ethnicity" in cols: agg["ethnicity"] = safe_mode
if "age_group" in cols: agg["age_group"] = safe_mode

df_tmp = df_split.copy()
df_tmp["clip_id"] = df_tmp["clip_id"].astype(str)
df_tmp["group_id"] = df_tmp["group_id"].astype(str)

df_group = (
    df_tmp.groupby("group_id", as_index=False)
          .agg(agg)
          .rename(columns={
              "clip_id": "n_clips",
              "avg_trait": "avg_trait_group",
              "gender": "gender_group",
              "ethnicity": "ethnicity_group",
              "age_group": "age_group_group",
          })
)

# norm kategori (jadi string biar key strata konsisten)
for c in ["gender_group", "ethnicity_group", "age_group_group"]:
    if c in df_group.columns:
        df_group[c] = df_group[c].map(norm_cat)

# ---- bikin avg_bin untuk uji stratifikasi label ----
N_BINS = 5
s = df_group["avg_trait_group"]
try:
    df_group["avg_bin"] = pd.qcut(s, q=N_BINS, duplicates="drop").astype(str)
except ValueError:
    df_group["avg_bin"] = pd.cut(s, bins=N_BINS).astype(str)

print("\nGroup summary:")
print("groups:", len(df_group), "| total clips:", int(df_group["n_clips"].sum()))
print("avg clips/group:", round(df_group["n_clips"].mean(), 3))

# ---- daftar kandidat strata yang mau diuji ----
candidates = []

def add_candidate(name, parts):
    # parts: list kolom yang harus ada
    if all(p in df_group.columns for p in parts):
        candidates.append((name, parts))

add_candidate("gender|ethnicity|avg_bin", ["gender_group","ethnicity_group","avg_bin"])
add_candidate("gender|ethnicity|age|avg_bin", ["gender_group","ethnicity_group","age_group_group","avg_bin"])
add_candidate("gender|avg_bin", ["gender_group","avg_bin"])
add_candidate("ethnicity|avg_bin", ["ethnicity_group","avg_bin"])
add_candidate("age|avg_bin", ["age_group_group","avg_bin"])
add_candidate("gender|ethnicity", ["gender_group","ethnicity_group"])
add_candidate("gender|age", ["gender_group","age_group_group"])
add_candidate("ethnicity|age", ["ethnicity_group","age_group_group"])

if not candidates:
    raise ValueError("Tidak ada kandidat strata yang bisa dibuat (kolom demografi tidak ada).")

# ---- fungsi evaluasi strata ----
def eval_strata(parts, small_thr=3):
    key = df_group[parts].astype(str).agg("|".join, axis=1)
    vc = key.value_counts()
    return {
        "parts": "|".join(parts),
        "n_strata": int(vc.shape[0]),
        "min": int(vc.min()),
        "p10": int(np.percentile(vc.values, 10)),
        "median": int(np.median(vc.values)),
        "p90": int(np.percentile(vc.values, 90)),
        "max": int(vc.max()),
        "small_<3": int((vc < 3).sum()),
        "small_<5": int((vc < 5).sum()),
        "top1_count": int(vc.iloc[0]),
        "top5_preview": vc.head(5).to_dict(),
    }

rows = []
for name, parts in candidates:
    r = eval_strata(parts)
    r["name"] = name
    rows.append(r)

df_eval = pd.DataFrame(rows).sort_values(
    by=["small_<3", "small_<5", "n_strata"], ascending=[True, True, True]
).reset_index(drop=True)

# tampilkan ringkasan
display(df_eval[[
    "name","parts","n_strata","min","median","max","small_<3","small_<5","top1_count"
]])

# opsional: lihat preview 5 strata teratas utk kandidat terbaik
best = df_eval.iloc[0]
print("\nBest candidate by (fewest small strata):", best["name"])
print("Top-5 strata counts preview:")
for k,v in best["top5_preview"].items():
    print(f"  {k} : {v}")


df_split cols: ['age_group', 'agreeableness', 'audio_out', 'avg_trait', 'clip_id', 'conscientiousness', 'ethnicity', 'extraversion', 'gender', 'group_id', 'is_vad_drop', 'neuroticism', 'openness', 'split_official']

Group summary:
groups: 3054 | total clips: 9974
avg clips/group: 3.266


Unnamed: 0,name,parts,n_strata,min,median,max,small_<3,small_<5,top1_count
0,gender|ethnicity,gender_group|ethnicity_group,6,33,164,1446,0,0,1446
1,gender|avg_bin,gender_group|avg_bin,10,159,305,452,0,0,452
2,ethnicity|avg_bin,ethnicity_group|avg_bin,15,14,65,559,0,0,559
3,gender|age,gender_group|age_group_group,14,4,57,916,0,1,916
4,gender|ethnicity|avg_bin,gender_group|ethnicity_group|avg_bin,30,2,29,414,1,2,414
5,ethnicity|age,ethnicity_group|age_group_group,18,1,38,1276,1,3,1276
6,age|avg_bin,age_group_group|avg_bin,34,1,22,334,3,7,334
7,gender|ethnicity|age|avg_bin,gender_group|ethnicity_group|age_group_group|a...,138,1,6,229,38,60,229



Best candidate by (fewest small strata): gender|ethnicity
Top-5 strata counts preview:
  2|2 : 1446
  1|2 : 1171
  2|3 : 219
  1|3 : 109
  2|1 : 76


## **Kesimpulan Uji Kandidat Strata untuk Strict Split**

### Ringkasan Data (hasil preprocessing + filter clean)

* Total **group_id**: **3054**
* Total **clip**: **9974**
* Rata-rata **clip per group**: **3.266**

Split strict dilakukan pada level **group_id** untuk mencegah kebocoran identitas antar train, validation, dan test. Karena itu, pemilihan **strata** ditujukan untuk menjaga **keseimbangan distribusi atribut** antar split.

---

### Hasil Evaluasi Kandidat Strata

Kriteria utama yang dipakai adalah **jumlah strata kecil** (`small_<3` dan `small_<5`). Strata yang terlalu kecil membuat stratified split menjadi tidak stabil karena beberapa kombinasi kategori tidak cukup sampel untuk dibagi ke 3 subset (3:1:1).

**1) Strata paling stabil**

* **`gender|ethnicity`**

  * `n_strata = 6`
  * `small_<3 = 0`, `small_<5 = 0`
  * Artinya semua strata memiliki ukuran cukup besar sehingga pembagian train/val/test lebih robust.

**2) Alternatif yang masih stabil (tanpa strata kecil)**

* **`gender|avg_bin`** (`n_strata = 10`, `small_<3 = 0`, `small_<5 = 0`)
* **`ethnicity|avg_bin`** (`n_strata = 15`, `small_<3 = 0`, `small_<5 = 0`)

Alternatif ini juga stabil, namun memasukkan `avg_bin` akan menambah dimensi label sehingga jumlah strata meningkat dan berpotensi membuat distribusi beberapa strata makin tipis pada skenario lain.

**3) Kandidat yang mulai berisiko**

* **`gender|age`** memiliki `small_<5 = 1`
* **`gender|ethnicity|avg_bin`** memiliki `small_<3 = 1` dan `small_<5 = 2`
* Kombinasi ini sudah menghasilkan beberapa strata yang terlalu kecil, sehingga stratified split dapat menjadi kurang konsisten.

**4) Kandidat yang tidak direkomendasikan**

* **`gender|ethnicity|age|avg_bin`**

  * `n_strata = 138`
  * `small_<3 = 38`, `small_<5 = 60`
  * Banyak strata berukuran sangat kecil, sehingga pembagian 3:1:1 per strata sulit dilakukan dan hasil split cenderung tidak stabil.

---

### Rekomendasi Strata untuk Strict Split

Berdasarkan uji ini, strata yang paling aman dan stabil adalah:

* âœ… **Gunakan: `gender|ethnicity`** sebagai strata utama untuk strict split.

Strata ini memberi keseimbangan demografis dasar tanpa menciptakan banyak strata kecil. Nilai `avg_trait` dan `age_group` tetap dapat digunakan untuk **evaluasi distribusi setelah split**, meskipun tidak dimasukkan sebagai kunci stratifikasi.

---

### Catatan Distribusi Strata Terbesar

Strata terbesar adalah `2|2` dengan **1446 group**, diikuti `1|2` dengan **1171 group**, sehingga dataset menunjukkan dominasi pada kombinasi gender dan ethnicity tertentu. Ini semakin menguatkan alasan untuk menjaga keseimbangan minimal melalui `gender|ethnicity` saat split.


In [15]:
# === STEP 3: Strict Split 3:1:1 (group-level) stratified by gender|ethnicity ===
import pandas as pd
import numpy as np

SEED = 42
rng = np.random.default_rng(SEED)

# --- sanity check ---
need = ["clip_id", "group_id", "gender", "ethnicity"]
missing = [c for c in need if c not in df_split.columns]
if missing:
    raise ValueError(f"Kolom wajib tidak ada di df_split: {missing}")

df_tmp = df_split.copy()
df_tmp["clip_id"] = df_tmp["clip_id"].astype(str)
df_tmp["group_id"] = df_tmp["group_id"].astype(str)

# optional: pastikan yang dipakai memang clean
if "is_vad_drop" in df_tmp.columns:
    bad = int(df_tmp["is_vad_drop"].sum())
    if bad > 0:
        print(f"[WARN] df_split masih mengandung is_vad_drop=True sebanyak {bad}. "
              "Sebaiknya df_split hanya berisi clean samples (df_clean).")

# --- helper: mode stabil ---
def safe_mode(s: pd.Series):
    s = s.dropna()
    if s.empty:
        return np.nan
    vc = s.value_counts()
    top = vc[vc == vc.max()].index.tolist()
    try:
        return sorted(top)[0]  # tie-break deterministik
    except Exception:
        return top[0]

# --- 1) group-level table (1 row per group_id) ---
g = (
    df_tmp.groupby("group_id", as_index=False)
          .agg(
              n_clips=("clip_id", "count"),
              gender_group=("gender", safe_mode),
              ethnicity_group=("ethnicity", safe_mode),
          )
)

g["strata"] = g["gender_group"].astype(str) + "|" + g["ethnicity_group"].astype(str)

# --- 2) stratified split per strata (target 3:1:1 by GROUP count) ---
def split_counts(n, ratios=(3,1,1)):
    a,b,c = ratios
    total = a+b+c
    n_train = int(np.floor(n * a / total))
    n_val   = int(np.floor(n * b / total))
    n_test  = n - n_train - n_val
    return n_train, n_val, n_test

rows = []
for strata, sub in g.groupby("strata"):
    ids = sub["group_id"].to_numpy()
    rng.shuffle(ids)

    n = len(ids)
    n_train, n_val, n_test = split_counts(n, ratios=(3,1,1))

    # distribusikan sisa (kalau ada) biar mendekati 3:1:1
    # n_test sudah menampung sisa by design, jadi tetap total pas.
    train_ids = ids[:n_train]
    val_ids   = ids[n_train:n_train+n_val]
    test_ids  = ids[n_train+n_val:]

    rows.append(pd.DataFrame({
        "group_id": np.concatenate([train_ids, val_ids, test_ids]),
        "split_strict": (["train"]*len(train_ids)) + (["val"]*len(val_ids)) + (["test"]*len(test_ids)),
        "strata": strata
    }))

df_group_split = pd.concat(rows, ignore_index=True)

# --- 3) validations (group disjoint) ---
# setiap group_id harus cuma muncul 1 split
chk = df_group_split.groupby("group_id")["split_strict"].nunique()
bad = chk[chk > 1]
if len(bad) > 0:
    raise RuntimeError(f"Ada group_id yang masuk lebih dari satu split: {len(bad)}")

# semua group harus ke-assign
if df_group_split["split_strict"].isna().any():
    raise RuntimeError("Ada group_id yang tidak ter-assign split_strict.")

# --- 4) propagate to clip-level ---
df_strict = df_tmp.merge(df_group_split[["group_id","split_strict"]], on="group_id", how="left")

if df_strict["split_strict"].isna().any():
    raise RuntimeError("Ada clip yang tidak mendapatkan split_strict (cek merge group_id).")

# --- 5) reporting ---
print("=== Split size (GROUP count) ===")
print(df_group_split["split_strict"].value_counts())

print("\n=== Split size (CLIP count) ===")
print(df_strict["split_strict"].value_counts())

print("\n=== Strata distribution (GROUP count) per split ===")
tab = pd.crosstab(df_group_split["strata"], df_group_split["split_strict"])
display(tab)

# opsional: bandingkan distribusi gender & ethnicity di level group
print("\n=== Gender distribution (GROUP) per split ===")
display(pd.crosstab(df_group_split["split_strict"], df_group_split["group_id"].map(
    dict(zip(g["group_id"], g["gender_group"]))
)))

print("\n=== Ethnicity distribution (GROUP) per split ===")
display(pd.crosstab(df_group_split["split_strict"], df_group_split["group_id"].map(
    dict(zip(g["group_id"], g["ethnicity_group"]))
 )))

# output:
# df_group_split: mapping group_id -> split_strict
# df_strict     : per-clip dataframe + split_strict
df_strict.head()


=== Split size (GROUP count) ===
split_strict
train    1829
test      617
val       608
Name: count, dtype: int64

=== Split size (CLIP count) ===
split_strict
train    5936
test     2039
val      1999
Name: count, dtype: int64

=== Strata distribution (GROUP count) per split ===


split_strict,test,train,val
strata,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1|1,8,19,6
1|2,235,702,234
1|3,23,65,21
2|1,16,45,15
2|2,290,867,289
2|3,45,131,43



=== Gender distribution (GROUP) per split ===


group_id,1,2
split_strict,Unnamed: 1_level_1,Unnamed: 2_level_1
test,266,351
train,786,1043
val,261,347



=== Ethnicity distribution (GROUP) per split ===


group_id,1,2,3
split_strict,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,24,525,68
train,64,1569,196
val,21,523,64


Unnamed: 0,clip_id,group_id,audio_out,gender,ethnicity,age_group,avg_trait,extraversion,neuroticism,agreeableness,conscientiousness,openness,split_official,is_vad_drop,split_strict
0,--Ymqszjv54.001,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.594761,0.551402,0.5,0.527473,0.650485,0.744444,train,False,train
1,--Ymqszjv54.003,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.455697,0.392523,0.427083,0.516484,0.475728,0.466667,train,False,train
2,--Ymqszjv54.004,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.4207,0.317757,0.322917,0.549451,0.368932,0.544444,train,False,train
3,--Ymqszjv54.005,--Ymqszjv54,output\preprocessing\preprocessed_full\--Ymqsz...,1,2,5,0.325838,0.299065,0.291667,0.373626,0.320388,0.344444,train,False,train
4,-2qsCrkXdWs.001,-2qsCrkXdWs,output\preprocessing\preprocessed_full\-2qsCrk...,1,2,2,0.571627,0.476636,0.604167,0.593407,0.572816,0.611111,train,False,train


In [16]:
# === CEK is_vad_drop ===
import pandas as pd

if "is_vad_drop" not in df_split.columns:
    print("Kolom 'is_vad_drop' tidak ada di df_split.")
else:
    n_true = int(df_split["is_vad_drop"].fillna(False).astype(bool).sum())
    n_all  = len(df_split)
    print(f"is_vad_drop=True: {n_true} / {n_all}")

    if n_true > 0:
        display(
            df_split.loc[df_split["is_vad_drop"] == True, 
                        ["clip_id","group_id","audio_out","split_official","gender","ethnicity","age_group"]]
            .head(20)
        )


is_vad_drop=True: 0 / 9974


In [17]:
# === SAVE: group_split_strict.csv + manifest_strict.csv ===
from pathlib import Path
import pandas as pd

# pastikan output dari cell split sudah ada
for var in ["df_group_split", "df_strict"]:
    if var not in globals():
        raise NameError(f"Variabel '{var}' belum ada. Jalankan dulu cell split strict yang menghasilkan {var}.")

# drop kolom is_vad_drop (kalau masih ada)
if "is_vad_drop" in df_strict.columns:
    df_strict = df_strict.drop(columns=["is_vad_drop"])

# folder output
OUT_DIR = ROOT / Path("output") / "split_strict"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- buat group_split_strict versi lebih informatif (tambahin n_clips + mode gender/ethnicity) ---
def safe_mode(s: pd.Series):
    s = s.dropna()
    if s.empty:
        return pd.NA
    m = s.mode()
    return m.iloc[0] if len(m) else pd.NA

group_info = (
    df_strict.groupby("group_id", as_index=False)
    .agg(
        n_clips=("clip_id", "count"),
        gender_group=("gender", safe_mode),
        ethnicity_group=("ethnicity", safe_mode),
    )
)

df_group_out = df_group_split.merge(group_info, on="group_id", how="left")

# urutkan kolom biar enak dibaca
preferred = ["group_id", "split_strict", "strata", "n_clips", "gender_group", "ethnicity_group"]
cols = [c for c in preferred if c in df_group_out.columns] + [c for c in df_group_out.columns if c not in preferred]
df_group_out = df_group_out[cols].sort_values(["split_strict", "group_id"]).reset_index(drop=True)

# manifest strict (clip-level)
df_manifest_out = df_strict.sort_values(["split_strict", "group_id", "clip_id"]).reset_index(drop=True)

# simpan
path_group = OUT_DIR / "group_split_strict.csv"
path_manifest = OUT_DIR / "manifest_strict.csv"

df_group_out.to_csv(path_group, index=False)
df_manifest_out.to_csv(path_manifest, index=False)

print("Saved:")
print(" -", path_group.as_posix(), "| rows:", len(df_group_out), "| unique groups:", df_group_out["group_id"].nunique())
print(" -", path_manifest.as_posix(), "| rows:", len(df_manifest_out), "| unique clips:", df_manifest_out["clip_id"].nunique())


Saved:
 - e:/tugas-akhir-qiqi/output/split_strict/group_split_strict.csv | rows: 3054 | unique groups: 3054
 - e:/tugas-akhir-qiqi/output/split_strict/manifest_strict.csv | rows: 9974 | unique clips: 9974
