# **Imports + Config**

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
import hashlib
import warnings

# Preprocessing constants. None of them is referenced by the cells visible in
# this part of the notebook, so the meanings below are inferred from the names
# only -- TODO confirm against the cells that consume them.
TARGET_SR = 16000  # presumably the target sample rate in Hz
TARGET_SEC = 15.0  # presumably the target clip duration in seconds
MIN_SEC = 1.0  # presumably the minimum accepted clip duration in seconds
MIN_SPEECH_SEC = 2.0  # presumably the minimum amount of speech in seconds

# NOTE: this silences every Python warning for the rest of the kernel session.
warnings.filterwarnings("ignore")


# **Path split**

In [2]:
# The project root is the parent of the current working directory; the dataset
# lives in <root>/Dataset.
ROOT = Path.cwd().parent
DATASET = ROOT / "Dataset"

# One entry per official split: the split folder ("split_root") and the folder
# holding its extracted audio ("audio_dir").
SPLITS = {
    split: {
        "split_root": DATASET / split,
        "audio_dir": DATASET / split / subdir / "audio",
    }
    for split, subdir in (("Train", "train"), ("Val", "val"), ("Test", "test"))
}

# Report whether each audio folder is actually present on disk.
for k, v in SPLITS.items():
    print(k, "audio_dir exists?", v["audio_dir"].exists(), "->", v["audio_dir"])


Train audio_dir exists? True -> e:\tugas-akhir-qiqi\Dataset\Train\train\audio
Val audio_dir exists? True -> e:\tugas-akhir-qiqi\Dataset\Val\val\audio
Test audio_dir exists? True -> e:\tugas-akhir-qiqi\Dataset\Test\test\audio


# **Cek metadata**

In [3]:
# Ethnicity & gender annotations (CSV), one file for dev and one for test
EnG_dev_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_dev.csv"
EnG_test_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_test.csv"

# Age annotations (CSV), one file for dev and one for test
Age_dev_csv = DATASET / "Age" / "age_anno_dev.csv"
Age_test_csv = DATASET / "Age" / "age_anno_test.csv"

# Load the four annotation tables and print their shapes.
# The ethnicity/gender files are read with ';' as separator, the age files with
# the pandas default (',').
EnG_dev_df = pd.read_csv(EnG_dev_csv, sep=';')
EnG_test_df = pd.read_csv(EnG_test_csv, sep=';')
Age_dev_df = pd.read_csv(Age_dev_csv)
Age_test_df = pd.read_csv(Age_test_csv)
print("EnG_dev_df shape:", EnG_dev_df.shape)
print("EnG_test_df shape:", EnG_test_df.shape)
print("Age_dev_df shape:", Age_dev_df.shape)
print("Age_test_df shape:", Age_test_df.shape)
print("\n")

# Show the first rows of every table without pandas truncating rows/columns.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("EnG_dev_df head:\n\n", EnG_dev_df.head())
    print("EnG_test_df head:\n\n", EnG_test_df.head())
    print("Age_dev_df head:\n\n", Age_dev_df.head())
    print("Age_test_df head:\n\n", Age_test_df.head())

EnG_dev_df shape: (8000, 4)
EnG_test_df shape: (2000, 4)
Age_dev_df shape: (8000, 3)
Age_test_df shape: (2000, 3)


EnG_dev_df head:

              VideoName    YouTubeID  Ethnicity  Gender
0  --Ymqszjv54.001.mp4  --Ymqszjv54          2       1
1  --Ymqszjv54.003.mp4  --Ymqszjv54          2       1
2  --Ymqszjv54.004.mp4  --Ymqszjv54          2       1
3  --Ymqszjv54.005.mp4  --Ymqszjv54          2       1
4  -2qsCrkXdWs.001.mp4  -2qsCrkXdWs          2       1
EnG_test_df head:

              VideoName    YouTubeID  Ethnicity  Gender
0  --Ymqszjv54.000.mp4  --Ymqszjv54          2       1
1  -10-QQDO_ME.001.mp4  -10-QQDO_ME          2       2
2  -10-QQDO_ME.002.mp4  -10-QQDO_ME          2       2
3  -10-QQDO_ME.005.mp4  -10-QQDO_ME          2       2
4  -4J4xkfN5cI.000.mp4  -4J4xkfN5cI          2       2
Age_dev_df head:

              VideoName    YouTubeID  AgeGroup
0  --Ymqszjv54.001.mp4  --Ymqszjv54         5
1  --Ymqszjv54.003.mp4  --Ymqszjv54         5
2  --Ymqszjv54.004.mp4  --Ym

Karena anotasi EnG (etnis & gender) dan anotasi Age sama-sama tersedia dalam versi dev dan test, keduanya digabung (merge) menjadi dua metadata, yaitu dev dan test. Kedua metadata ini kemudian digabungkan juga dengan skor Big Five yang ada di anotasi tiap split.


In [4]:
import pickle

def load_pkl(path: Path):
    """Unpickle the file at ``path`` and return the stored object.

    The file is first loaded with pickle's default settings. If that raises
    ``UnicodeDecodeError``, the file is rewound and loaded again with
    ``encoding="latin1"``.

    NOTE: unpickling can execute arbitrary code -- only use this on annotation
    files from a trusted source.
    """
    with open(path, "rb") as handle:
        try:
            loaded = pickle.load(handle)
        except UnicodeDecodeError:
            # Rewind: the failed attempt already consumed part of the stream.
            handle.seek(0)
            loaded = pickle.load(handle, encoding="latin1")
    return loaded

def inspect_annotation(split_name: str):
    """Print a diagnostic summary of the annotation pickle of one split.

    Looks in ``SPLITS[split_name]["split_root"] / "annotation"`` for ``*.pkl``
    and ``*.pickle`` files, loads the largest one with ``load_pkl`` and prints
    what it contains. DataFrame, dict and list payloads are previewed as a
    DataFrame; any other type is shown as a truncated ``repr``.

    Args:
        split_name: key of ``SPLITS`` ("Train", "Val" or "Test").

    Returns:
        None. All results are printed / displayed.
    """
    annotation_dir = SPLITS[split_name]["split_root"] / "annotation"
    print(f"\n===== {split_name} =====")
    print("annotation dir:", annotation_dir)

    if not annotation_dir.exists():
        print("-> annotation folder tidak ada")
        return

    candidates = sorted([*annotation_dir.glob("*.pkl"), *annotation_dir.glob("*.pickle")])
    print("PKL found:", [c.name for c in candidates])

    if len(candidates) == 0:
        print("-> tidak ada pkl")
        return

    # Pick the largest pickle (assumed to be the main annotation file).
    chosen = max(candidates, key=lambda c: c.stat().st_size)
    print("Using PKL:", chosen.name, "| size:", chosen.stat().st_size)

    payload = load_pkl(chosen)
    print("Loaded type:", type(payload))

    if isinstance(payload, pd.DataFrame):
        # Already tabular: just preview it.
        print("Already DataFrame. shape:", payload.shape)
        display(payload.head())

    elif isinstance(payload, dict):
        key_list = list(payload.keys())
        print("Dict keys sample:", key_list[:10])
        if key_list and key_list[0] is not None:
            head_value = payload[key_list[0]]
        else:
            head_value = None
        print("First value type:", type(head_value))

        # Try to build a DataFrame (works for dict-of-dict or dict-of-list).
        # If the index holds trait names and the columns hold video names, the
        # frame would need to be transposed before use.
        try:
            frame = pd.DataFrame(payload)
            print("pd.DataFrame(obj) shape:", frame.shape)
            display(frame.head())
            print("Index sample:", list(frame.index)[:10])
            print("Columns sample:", list(frame.columns)[:10])
        except Exception as err:
            print("pd.DataFrame(obj) failed:", err)

    elif isinstance(payload, list):
        print("List len:", len(payload))
        print("First elem type:", type(payload[0]) if payload else None)
        try:
            frame = pd.DataFrame(payload)
            print("pd.DataFrame(list) shape:", frame.shape)
            display(frame.head())
        except Exception as err:
            print("pd.DataFrame(list) failed:", err)

    else:
        # Any other payload type: show a short repr only.
        print("Unhandled type; coba print repr pendek:")
        print(repr(payload)[:500])

# Run the inspection one split at a time (Train here)
inspect_annotation("Train")



===== Train =====
annotation dir: e:\tugas-akhir-qiqi\Dataset\Train\annotation
PKL found: ['annotation_training.pkl']
Using PKL: annotation_training.pkl | size: 793769
Loaded type: <class 'dict'>
Dict keys sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']
First value type: <class 'dict'>
pd.DataFrame(obj) shape: (6000, 6)


Unnamed: 0,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
J4GQm9j0JZ0.003.mp4,0.523364,0.552083,0.626374,0.601942,0.504673,0.488889
zEyRyTnIw5I.005.mp4,0.345794,0.375,0.472527,0.582524,0.457944,0.366667
nskJh7v6v1U.004.mp4,0.252336,0.291667,0.406593,0.485437,0.373832,0.511111
6wHQsN5g2RM.000.mp4,0.457944,0.489583,0.505495,0.398058,0.457944,0.377778
dQOeQYWIgm8.000.mp4,0.607477,0.489583,0.406593,0.621359,0.570093,0.622222


Index sample: ['J4GQm9j0JZ0.003.mp4', 'zEyRyTnIw5I.005.mp4', 'nskJh7v6v1U.004.mp4', '6wHQsN5g2RM.000.mp4', 'dQOeQYWIgm8.000.mp4', 'eHcRre1YsNA.000.mp4', 'vZpneJlniAE.005.mp4', 'oANKg9_grdA.004.mp4', 'VuadgOz6T7s.000.mp4', '7nhJXn9PI0I.001.mp4']
Columns sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']


In [5]:
inspect_annotation("Val")


===== Val =====
annotation dir: e:\tugas-akhir-qiqi\Dataset\Val\annotation
PKL found: ['annotation_validation.pkl']
Using PKL: annotation_validation.pkl | size: 261721
Loaded type: <class 'dict'>
Dict keys sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']
First value type: <class 'dict'>
pd.DataFrame(obj) shape: (2000, 6)


Unnamed: 0,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
modNfUPt3F4.002.mp4,0.64486,0.59375,0.615385,0.640777,0.616822,0.555556
h6LOjpCRXtY.005.mp4,0.439252,0.520833,0.417582,0.572816,0.439252,0.411111
WER4ww680QQ.004.mp4,0.457944,0.3125,0.428571,0.398058,0.373832,0.555556
c4XnKouozXU.002.mp4,0.364486,0.572917,0.527473,0.553398,0.523364,0.322222
OEKg-Tvwcbk.002.mp4,0.345794,0.46875,0.516484,0.417476,0.383178,0.477778


Index sample: ['modNfUPt3F4.002.mp4', 'h6LOjpCRXtY.005.mp4', 'WER4ww680QQ.004.mp4', 'c4XnKouozXU.002.mp4', 'OEKg-Tvwcbk.002.mp4', 'PtA7yAu9-VE.003.mp4', 'TmpP2fXeVtk.004.mp4', '1uC-2TZqplE.002.mp4', '_01AyUz9J9I.003.mp4', '_RfHkyf68Zs.000.mp4']
Columns sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']


In [6]:
inspect_annotation("Test")


===== Test =====
annotation dir: e:\tugas-akhir-qiqi\Dataset\Test\annotation
PKL found: ['annotation_test.pkl']
Using PKL: annotation_test.pkl | size: 261721
Loaded type: <class 'dict'>
Dict keys sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']
First value type: <class 'dict'>
pd.DataFrame(obj) shape: (2000, 6)


Unnamed: 0,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
htH89DBizno.004.mp4,0.485981,0.645833,0.681319,0.669903,0.626168,0.822222
p_wf-KszNlk.001.mp4,0.616822,0.59375,0.692308,0.514563,0.570093,0.655556
MuYYY3XaJ7Q.001.mp4,0.46729,0.625,0.56044,0.524272,0.514019,0.522222
0MB91ku0eEw.005.mp4,0.411215,0.458333,0.714286,0.660194,0.570093,0.4
WpEZOSrENL0.003.mp4,0.317757,0.4375,0.384615,0.524272,0.448598,0.411111


Index sample: ['htH89DBizno.004.mp4', 'p_wf-KszNlk.001.mp4', 'MuYYY3XaJ7Q.001.mp4', '0MB91ku0eEw.005.mp4', 'WpEZOSrENL0.003.mp4', 'C2Y9Puk3Obk.004.mp4', 'ask-ZFRztf8.003.mp4', 'TSGpD2NBeCQ.005.mp4', '54JawR1x0II.004.mp4', '9n8dNi-ERQ0.001.mp4']
Columns sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']


# **Cek Leakage**

In [8]:
from pathlib import Path

def group_id_from_clip_id(clip_id: str) -> str:
    """Return the part of ``clip_id`` before its first dot.

    Example: "--Ymqszjv54.001" -> "--Ymqszjv54". The argument is converted
    with ``str()`` first; a value without a dot is returned unchanged.
    """
    prefix, _, _ = str(clip_id).partition(".")
    return prefix

def get_clip_ids_from_audio(audio_dir: Path):
    """Collect the stems of the ``*.wav`` entries directly inside ``audio_dir``.

    Returns a set of names with the final ``.wav`` extension removed
    (e.g. "abc.001.wav" -> "abc.001"). Sub-folders are not searched.
    """
    stems = set()
    for wav_path in audio_dir.glob("*.wav"):
        stems.add(wav_path.stem)
    return stems

def pct(a, b):
    """Return ``a`` as a percentage of ``b``; 0.0 when ``b`` is falsy (e.g. 0)."""
    if not b:
        return 0.0
    return a / b * 100

def report_leakage():
    """Print a group-level leakage report for the three official splits.

    Clip ids are collected from the ``*.wav`` files of each split's
    ``audio_dir`` (via ``get_clip_ids_from_audio``) and reduced to group ids
    with ``group_id_from_clip_id``. The report lists clip and group counts,
    the pairwise group overlaps, the overlap of Train ∪ Val with Test, a
    verdict, and up to ten sample overlapping ids. Nothing is returned.
    """
    def preview(group_set, limit=10):
        # First `limit` ids in sorted order, or "-" when there is no overlap.
        if not group_set:
            return "-"
        return ", ".join(sorted(group_set)[:limit])

    train_ids, val_ids, test_ids = (
        get_clip_ids_from_audio(SPLITS[split]["audio_dir"])
        for split in ("Train", "Val", "Test")
    )
    train_groups, val_groups, test_groups = (
        {group_id_from_clip_id(clip) for clip in ids}
        for ids in (train_ids, val_ids, test_ids)
    )

    # Groups shared between splits.
    train_val = train_groups & val_groups
    train_test = train_groups & test_groups
    val_test = val_groups & test_groups
    seen_in_test = (train_groups | val_groups) & test_groups

    # The verdict only considers leakage into Test.
    verdict = (
        "Group overlap detected between (Train ∪ Val) and Test (identity leakage risk)."
        if seen_in_test
        else "No group overlap detected between (Train ∪ Val) and Test (clean at group-level)."
    )

    report = [
        "=== Leakage Check (Group-level) ===",
        "Group ID definition: group_id = clip_id.split('.')[0]  (≈ YouTubeID)",
        "",
        "1) Official split sizes (by extracted audio):",
        f"   - Train clips: {len(train_ids):,} | unique groups: {len(train_groups):,}",
        f"   - Val   clips: {len(val_ids):,} | unique groups: {len(val_groups):,}",
        f"   - Test  clips: {len(test_ids):,} | unique groups: {len(test_groups):,}",
        "",
        "2) Group overlaps (potential identity leakage):",
        f"   - Train ∩ Val : {len(train_val):,} groups ({pct(len(train_val), len(val_groups)):.2f}% of Val groups)",
        f"   - Train ∩ Test: {len(train_test):,} groups ({pct(len(train_test), len(test_groups)):.2f}% of Test groups)",
        f"   - Val   ∩ Test: {len(val_test):,} groups ({pct(len(val_test), len(test_groups)):.2f}% of Test groups)",
        "",
        "3) Test leakage from Train/Val combined:",
        f"   - (Train ∪ Val) ∩ Test: {len(seen_in_test):,} groups ({pct(len(seen_in_test), len(test_groups)):.2f}% of Test groups)",
        "",
        "4) Verdict:",
        f"   - {verdict}",
        "",
        "5) Sample overlapping group IDs:",
        f"   - Train∩Test sample: {preview(train_test)}",
        f"   - Val∩Test   sample: {preview(val_test)}",
    ]

    print("\n".join(report))

report_leakage()


=== Leakage Check (Group-level) ===
Group ID definition: group_id = clip_id.split('.')[0]  (≈ YouTubeID)

1) Official split sizes (by extracted audio):
   - Train clips: 6,000 | unique groups: 2,624
   - Val   clips: 2,000 | unique groups: 1,484
   - Test  clips: 2,000 | unique groups: 1,455

2) Group overlaps (potential identity leakage):
   - Train ∩ Val : 1,222 groups (82.35% of Val groups)
   - Train ∩ Test: 1,201 groups (82.54% of Test groups)
   - Val   ∩ Test: 689 groups (47.35% of Test groups)

3) Test leakage from Train/Val combined:
   - (Train ∪ Val) ∩ Test: 1,281 groups (88.04% of Test groups)

4) Verdict:
   - Group overlap detected between (Train ∪ Val) and Test (identity leakage risk).

5) Sample overlapping group IDs:
   - Train∩Test sample: --Ymqszjv54, -6otZ7M-Mro, -8asrRvfJWA, -DOqN0d8KHw, -Gl98Jn45Fs, -N6QKrbnaDs, -NwfYYf5xLo, -R2SZu3SYgM, -VTqcHNgH7M, -Wqk9eex6bQ
   - Val∩Test   sample: -6otZ7M-Mro, -8asrRvfJWA, -DOqN0d8KHw, -N6QKrbnaDs, -PWjgx2czwY, -R2SZu3SYgM, -

# **Build Metadata**

Akan dihasilkan 4 file metadata (masing-masing merupakan gabungan anotasi EnG, anotasi Age, dan anotasi Big Five official):
- meta_train_official.csv
- meta_val_official.csv
- meta_test_official.csv
- meta_master.csv (gabungan dari train, val, dan test yang akan digunakan untuk strict split; memuat kolom `split_official` bernilai train/val/test sebagai penanda split asal)

In [12]:
# Folder that receives the metadata CSVs; created (with parents) if missing.
OUT_DIR = ROOT / "output" / "preprocessing"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Extensions recognised as "media file" suffixes; only these are stripped.
_MEDIA_SUFFIXES = frozenset(
    {".mp4", ".wav", ".avi", ".mov", ".mkv", ".webm", ".mp3", ".flac"}
)


def norm_clip(x: str) -> str:
    """Turn a clip file name into its clip id.

    Any directory part is dropped and a trailing media extension (matched
    case-insensitively against ``_MEDIA_SUFFIXES``) is removed:

        "--Ymqszjv54.001.mp4" -> "--Ymqszjv54.001"
        "--Ymqszjv54.001"     -> "--Ymqszjv54.001"   (already a clip id)

    The previous implementation returned ``Path(x).stem`` unconditionally,
    which also stripped the numeric clip index from an id that had no file
    extension ("--Ymqszjv54.001" -> "--Ymqszjv54"), silently turning clip ids
    into group ids. Only known media extensions are removed now, so the
    function is idempotent.

    Args:
        x: file name, path or clip id; converted with ``str()`` first.

    Returns:
        The clip id as a string.
    """
    clip_path = Path(str(x))
    if clip_path.suffix.lower() in _MEDIA_SUFFIXES:
        return clip_path.stem
    return clip_path.name

def group_id_from_clip_id(clip_id: str) -> str:
    """Return the part of ``clip_id`` before its first dot.

    Example: "--Ymqszjv54.001" -> "--Ymqszjv54". The argument is converted
    with ``str()`` first; a value without a dot is returned unchanged.
    """
    prefix, _, _ = str(clip_id).partition(".")
    return prefix

In [13]:
# EnG Anotation (csv)
EnG_dev_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_dev.csv"
EnG_test_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_test.csv"

# Age Anotation (csv)
Age_dev_csv = DATASET / "Age" / "age_anno_dev.csv"
Age_test_csv = DATASET / "Age" / "age_anno_test.csv"

EnG_dev_df = pd.read_csv(EnG_dev_csv, sep=';')
EnG_test_df = pd.read_csv(EnG_test_csv, sep=';')
Age_dev_df = pd.read_csv(Age_dev_csv)
Age_test_df = pd.read_csv(Age_test_csv)

print("EnG_dev:", EnG_dev_df.shape, "| EnG_test:", EnG_test_df.shape)
print("Age_dev:", Age_dev_df.shape, "| Age_test:", Age_test_df.shape)


EnG_dev: (8000, 4) | EnG_test: (2000, 4)
Age_dev: (8000, 3) | Age_test: (2000, 3)


In [14]:
# Derive the join key: every annotation table gets a `clip_id` column computed
# from `VideoName` (added in place on the loaded frames).
for df in (EnG_dev_df, EnG_test_df, Age_dev_df, Age_test_df):
    df["clip_id"] = df["VideoName"].map(norm_clip)

# Attach AgeGroup to the ethnicity/gender rows (left join keeps every EnG row),
# then add the group id derived from the clip id.
meta_dev = (
    EnG_dev_df
    .merge(Age_dev_df[["clip_id", "AgeGroup"]], on="clip_id", how="left")
    .assign(group_id=lambda frame: frame["clip_id"].map(group_id_from_clip_id))
)
meta_test = (
    EnG_test_df
    .merge(Age_test_df[["clip_id", "AgeGroup"]], on="clip_id", how="left")
    .assign(group_id=lambda frame: frame["clip_id"].map(group_id_from_clip_id))
)

print("meta_dev:", meta_dev.shape)
print("meta_test:", meta_test.shape)
display(meta_dev.head())


meta_dev: (8000, 7)
meta_test: (2000, 7)


Unnamed: 0,VideoName,YouTubeID,Ethnicity,Gender,clip_id,AgeGroup,group_id
0,--Ymqszjv54.001.mp4,--Ymqszjv54,2,1,--Ymqszjv54.001,5,--Ymqszjv54
1,--Ymqszjv54.003.mp4,--Ymqszjv54,2,1,--Ymqszjv54.003,5,--Ymqszjv54
2,--Ymqszjv54.004.mp4,--Ymqszjv54,2,1,--Ymqszjv54.004,5,--Ymqszjv54
3,--Ymqszjv54.005.mp4,--Ymqszjv54,2,1,--Ymqszjv54.005,5,--Ymqszjv54
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2,1,-2qsCrkXdWs.001,2,-2qsCrkXdWs


In [15]:
def load_pkl(path: Path):
    """Unpickle the file at ``path`` and return the stored object.

    The file is first loaded with pickle's default settings. If that raises
    ``UnicodeDecodeError``, the file is rewound and loaded again with
    ``encoding="latin1"``.

    NOTE: unpickling can execute arbitrary code -- only use this on annotation
    files from a trusted source.
    """
    with open(path, "rb") as handle:
        try:
            loaded = pickle.load(handle)
        except UnicodeDecodeError:
            # Rewind: the failed attempt already consumed part of the stream.
            handle.seek(0)
            loaded = pickle.load(handle, encoding="latin1")
    return loaded

def load_bigfive_from_split(split_name: str) -> pd.DataFrame:
    """Load the trait annotations of one split as a tidy DataFrame.

    The largest ``*.pkl`` / ``*.pickle`` file in the split's ``annotation``
    folder is loaded with ``load_pkl``. It must contain a dict, which is
    passed to ``pd.DataFrame`` (for a trait -> {clip: value} mapping this
    gives one row per clip and one column per trait).

    Args:
        split_name: key of ``SPLITS`` ("Train", "Val" or "Test").

    Returns:
        DataFrame with columns ``clip_id``, the five trait columns,
        ``interview`` (only when present in the pickle) and ``group_id``.
        Trait columns missing from the pickle are filled with NaN; all score
        columns are coerced to numeric. The pickle path is stored in
        ``df.attrs["source"]`` before the final column selection.

    Raises:
        FileNotFoundError: no pickle file in the annotation folder.
        ValueError: the pickle does not contain a dict.
    """
    annotation_dir = SPLITS[split_name]["split_root"] / "annotation"
    candidates = sorted([*annotation_dir.glob("*.pkl"), *annotation_dir.glob("*.pickle")])
    if len(candidates) == 0:
        raise FileNotFoundError(f"Tidak ada PKL di: {annotation_dir}")

    # Largest file wins when several pickles are present.
    source_path = max(candidates, key=lambda c: c.stat().st_size)
    raw = load_pkl(source_path)
    if not isinstance(raw, dict):
        raise ValueError(f"Format PKL bukan dict: {type(raw)} (file: {source_path.name})")

    # The frame index holds the clip file names (possibly with ".mp4");
    # move it into a regular `clip_id` column and normalise it.
    traits = pd.DataFrame(raw).reset_index().rename(columns={"index": "clip_id"})
    traits["clip_id"] = traits["clip_id"].astype(str).map(norm_clip)

    # Always keep the five main traits; keep `interview` only when it exists.
    keep = ["clip_id", "extraversion", "neuroticism", "agreeableness", "conscientiousness", "openness"]
    if "interview" in traits.columns:
        keep.append("interview")

    absent = [col for col in keep if col not in traits.columns]
    for col in absent:
        traits[col] = np.nan

    # Everything after clip_id is a score column: force it to numeric.
    for score_col in keep[1:]:
        traits[score_col] = pd.to_numeric(traits[score_col], errors="coerce")

    traits["group_id"] = traits["clip_id"].map(group_id_from_clip_id)
    traits.attrs["source"] = str(source_path)
    return traits[keep + ["group_id"]]

# Load the trait annotations of each official split.
big_train = load_bigfive_from_split("Train")
big_val   = load_bigfive_from_split("Val")
big_test  = load_bigfive_from_split("Test")

# Report the shape of each table and the pickle file it was read from
# (stored by load_bigfive_from_split in `attrs["source"]`).
print("BigFive Train:", big_train.shape, "| src:", big_train.attrs.get("source"))
print("BigFive Val  :", big_val.shape,   "| src:", big_val.attrs.get("source"))
print("BigFive Test :", big_test.shape,  "| src:", big_test.attrs.get("source"))
display(big_train.head())


BigFive Train: (6000, 8) | src: e:\tugas-akhir-qiqi\Dataset\Train\annotation\annotation_training.pkl
BigFive Val  : (2000, 8) | src: e:\tugas-akhir-qiqi\Dataset\Val\annotation\annotation_validation.pkl
BigFive Test : (2000, 8) | src: e:\tugas-akhir-qiqi\Dataset\Test\annotation\annotation_test.pkl


Unnamed: 0,clip_id,extraversion,neuroticism,agreeableness,conscientiousness,openness,interview,group_id
0,J4GQm9j0JZ0.003,0.523364,0.552083,0.626374,0.601942,0.488889,0.504673,J4GQm9j0JZ0
1,zEyRyTnIw5I.005,0.345794,0.375,0.472527,0.582524,0.366667,0.457944,zEyRyTnIw5I
2,nskJh7v6v1U.004,0.252336,0.291667,0.406593,0.485437,0.511111,0.373832,nskJh7v6v1U
3,6wHQsN5g2RM.000,0.457944,0.489583,0.505495,0.398058,0.377778,0.457944,6wHQsN5g2RM
4,dQOeQYWIgm8.000,0.607477,0.489583,0.406593,0.621359,0.622222,0.570093,dQOeQYWIgm8


In [16]:
def audio_clip_ids(audio_dir: Path) -> set[str]:
    """Collect the stems of the ``*.wav`` entries directly inside ``audio_dir``.

    Returns a set of names with the final ``.wav`` extension removed
    (e.g. "abc.001.wav" -> "abc.001"). Sub-folders are not searched.
    """
    stems = set()
    for wav_path in audio_dir.glob("*.wav"):
        stems.add(wav_path.stem)
    return stems

# Clip ids actually present as extracted .wav files, per official split.
# build_official_meta reads these three sets as globals.
train_ids = audio_clip_ids(SPLITS["Train"]["audio_dir"])
val_ids   = audio_clip_ids(SPLITS["Val"]["audio_dir"])
test_ids  = audio_clip_ids(SPLITS["Test"]["audio_dir"])

print("audio counts:", len(train_ids), len(val_ids), len(test_ids))


audio counts: 6000 2000 2000


In [22]:
def build_official_meta(split_name: str) -> pd.DataFrame:
    """Assemble the per-clip metadata table of one official split.

    Demographic rows (ethnicity, gender, age group) whose ``clip_id`` has an
    extracted audio file in the split are left-joined with the trait scores of
    the same split. Train and Val take their demographics from ``meta_dev``,
    Test from ``meta_test``.

    The split name is matched case-insensitively. Previously every name other
    than exactly "Train" or "Val" (e.g. "train", or a typo) silently fell
    through to the Test branch and returned test rows; unknown names now raise.

    Args:
        split_name: "Train", "Val" or "Test" (any letter case).

    Returns:
        DataFrame with ``group_id``, ``clip_id``, ``split_official``
        ("train"/"val"/"test"), the demographic columns and the trait columns
        first, followed by any remaining columns; ``YouTubeID`` is dropped.

    Raises:
        ValueError: ``split_name`` is not one of the three official splits.
    """
    # Validate first, before any of the global tables is touched.
    split_off = str(split_name).strip().lower()
    if split_off not in ("train", "val", "test"):
        raise ValueError(
            f"Unknown split_name {split_name!r}; expected 'Train', 'Val' or 'Test'."
        )

    if split_off == "train":
        ids, demo, big = train_ids, meta_dev, big_train
    elif split_off == "val":
        ids, demo, big = val_ids, meta_dev, big_val
    else:
        ids, demo, big = test_ids, meta_test, big_test

    # Restrict both tables to clips that have extracted audio in this split.
    demo_s = demo[demo["clip_id"].isin(ids)].copy()
    big_s = big[big["clip_id"].isin(ids)].copy()

    # Merge demographics + traits. The left join keeps every demographic row;
    # clips without a trait annotation get NaN scores. group_id is dropped from
    # the trait side because it is recomputed below.
    out = demo_s.merge(
        big_s.drop(columns=["group_id"]),
        on="clip_id",
        how="left",
        suffixes=("", "_bf"),
    )
    out["split_official"] = split_off
    out["group_id"] = out["clip_id"].map(group_id_from_clip_id)

    # Put the preferred columns first and keep any others after them.
    prefer_cols = [
        "group_id", "clip_id", "split_official",
        "VideoName", "Ethnicity", "Gender", "AgeGroup",
        "extraversion", "neuroticism", "agreeableness", "conscientiousness", "openness",
    ]
    if "interview" in out.columns:
        prefer_cols.append("interview")

    cols = [c for c in prefer_cols if c in out.columns] + [c for c in out.columns if c not in prefer_cols]
    out = out[cols]
    out = out.drop(columns=["YouTubeID"], errors="ignore")
    return out

# Build the metadata table of each official split.
meta_train_off = build_official_meta("Train")
meta_val_off   = build_official_meta("Val")
meta_test_off  = build_official_meta("Test")

# meta_master = train + val + test stacked into one table, for the strict split
meta_master = pd.concat([meta_train_off, meta_val_off, meta_test_off], ignore_index=True)

# Sanity checks: table shapes
print("meta_train_off:", meta_train_off.shape)
print("meta_val_off  :", meta_val_off.shape)
print("meta_test_off :", meta_test_off.shape)
print("meta_master   :", meta_master.shape)

# Check that clip_id is unique in the combined table.
# NOTE: this only prints the number of duplicates; it does not stop the run.
dup = meta_master["clip_id"].duplicated().sum()
print("duplicated clip_id in meta_master:", dup)

# Save the four tables as CSV (without the index column)
(meta_train_off).to_csv(OUT_DIR / "meta_train_official.csv", index=False)
(meta_val_off).to_csv(OUT_DIR / "meta_val_official.csv", index=False)
(meta_test_off).to_csv(OUT_DIR / "meta_test_official.csv", index=False)
(meta_master).to_csv(OUT_DIR / "meta_master.csv", index=False)

print("\nSaved to:", OUT_DIR.resolve())


meta_train_off: (6000, 13)
meta_val_off  : (2000, 13)
meta_test_off : (2000, 13)
meta_master   : (10000, 13)
duplicated clip_id in meta_master: 0

Saved to: E:\tugas-akhir-qiqi\output\preprocessing
