# **PART 1 - Build dan Init Metadata**

## **Imports + Config**

In [4]:
from pathlib import Path
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
import hashlib
import warnings

TARGET_SR = 16000
TARGET_SEC = 15.0
MIN_SEC = 1.0
MIN_SPEECH_SEC = 2.0

warnings.filterwarnings("ignore")


## **Path split**

In [6]:
ROOT = Path.cwd().parent
DATASET = ROOT / "Dataset"

SPLITS = {
    "Train": {
        "split_root": DATASET / "Train",
        "audio_dir":  DATASET / "Train" / "train" / "audio",
    },
    "Val": {
        "split_root": DATASET / "Val",
        "audio_dir":  DATASET / "Val" / "val" / "audio",
    },
    "Test": {
        "split_root": DATASET / "Test",
        "audio_dir":  DATASET / "Test" / "test" / "audio",
    },
}

for k,v in SPLITS.items():
    print(k, "audio_dir exists?", v["audio_dir"].exists(), "->", v["audio_dir"])


Train audio_dir exists? True -> e:\tugas-akhir-qiqi\Dataset\Train\train\audio
Val audio_dir exists? True -> e:\tugas-akhir-qiqi\Dataset\Val\val\audio
Test audio_dir exists? True -> e:\tugas-akhir-qiqi\Dataset\Test\test\audio


## **Cek metadata**

In [3]:
#metadata etnichity & gender (csv)
EnG_dev_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_dev.csv"
EnG_test_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_test.csv"

#Age Anotation (csv)
Age_dev_csv = DATASET / "Age" / "age_anno_dev.csv"
Age_test_csv = DATASET / "Age" / "age_anno_test.csv"

## load metadata csv and print
EnG_dev_df = pd.read_csv(EnG_dev_csv, sep=';')
EnG_test_df = pd.read_csv(EnG_test_csv, sep=';')
Age_dev_df = pd.read_csv(Age_dev_csv)
Age_test_df = pd.read_csv(Age_test_csv)
print("EnG_dev_df shape:", EnG_dev_df.shape)
print("EnG_test_df shape:", EnG_test_df.shape)
print("Age_dev_df shape:", Age_dev_df.shape)
print("Age_test_df shape:", Age_test_df.shape)
print("\n")

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("EnG_dev_df head:\n\n", EnG_dev_df.head())
    print("EnG_test_df head:\n\n", EnG_test_df.head())
    print("Age_dev_df head:\n\n", Age_dev_df.head())
    print("Age_test_df head:\n\n", Age_test_df.head())

EnG_dev_df shape: (8000, 4)
EnG_test_df shape: (2000, 4)
Age_dev_df shape: (8000, 3)
Age_test_df shape: (2000, 3)


EnG_dev_df head:

              VideoName    YouTubeID  Ethnicity  Gender
0  --Ymqszjv54.001.mp4  --Ymqszjv54          2       1
1  --Ymqszjv54.003.mp4  --Ymqszjv54          2       1
2  --Ymqszjv54.004.mp4  --Ymqszjv54          2       1
3  --Ymqszjv54.005.mp4  --Ymqszjv54          2       1
4  -2qsCrkXdWs.001.mp4  -2qsCrkXdWs          2       1
EnG_test_df head:

              VideoName    YouTubeID  Ethnicity  Gender
0  --Ymqszjv54.000.mp4  --Ymqszjv54          2       1
1  -10-QQDO_ME.001.mp4  -10-QQDO_ME          2       2
2  -10-QQDO_ME.002.mp4  -10-QQDO_ME          2       2
3  -10-QQDO_ME.005.mp4  -10-QQDO_ME          2       2
4  -4J4xkfN5cI.000.mp4  -4J4xkfN5cI          2       2
Age_dev_df head:

              VideoName    YouTubeID  AgeGroup
0  --Ymqszjv54.001.mp4  --Ymqszjv54         5
1  --Ymqszjv54.003.mp4  --Ymqszjv54         5
2  --Ymqszjv54.004.mp4  --Ym

Karena ada dev dan test yang ada di EnG dan juga di Age csv, maka keputusan yaitu merge jadi 2 metadata, yaitu dev dan test dan digabungin sekalian dengan Big 5 yang ada di anotasi tiap split.


In [4]:
import pickle

def load_pkl(path: Path):
    with open(path, "rb") as f:
        try:
            return pickle.load(f)
        except UnicodeDecodeError:
            f.seek(0)
            return pickle.load(f, encoding="latin1")

def inspect_annotation(split_name: str):
    ann_dir = SPLITS[split_name]["split_root"] / "annotation"
    print(f"\n===== {split_name} =====")
    print("annotation dir:", ann_dir)

    if not ann_dir.exists():
        print("-> annotation folder tidak ada")
        return

    pkls = sorted(list(ann_dir.glob("*.pkl")) + list(ann_dir.glob("*.pickle")))
    print("PKL found:", [p.name for p in pkls])

    if not pkls:
        print("-> tidak ada pkl")
        return

    # pilih pkl terbesar (biasanya utama)
    pkl_path = max(pkls, key=lambda p: p.stat().st_size)
    print("Using PKL:", pkl_path.name, "| size:", pkl_path.stat().st_size)

    obj = load_pkl(pkl_path)
    print("Loaded type:", type(obj))

    # kalau sudah DataFrame
    if isinstance(obj, pd.DataFrame):
        df = obj
        print("Already DataFrame. shape:", df.shape)
        display(df.head())
        return

    # kalau dict
    if isinstance(obj, dict):
        keys = list(obj.keys())
        print("Dict keys sample:", keys[:10])
        first_key = keys[0] if keys else None
        first_val = obj[first_key] if first_key is not None else None
        print("First value type:", type(first_val))

        # Coba bikin DataFrame (akan benar kalau dict-of-dict atau dict-of-list)
        try:
            df = pd.DataFrame(obj)
            print("pd.DataFrame(obj) shape:", df.shape)
            display(df.head())
            print("Index sample:", list(df.index)[:10])
            print("Columns sample:", list(df.columns)[:10])

            # indikasi butuh transpose:
            # - kalau index itu trait (extraversion, openness, ...) dan columns itu video, maka harus transpose
            return
        except Exception as e:
            print("pd.DataFrame(obj) failed:", e)
            return

    # kalau list
    if isinstance(obj, list):
        print("List len:", len(obj))
        print("First elem type:", type(obj[0]) if obj else None)
        try:
            df = pd.DataFrame(obj)
            print("pd.DataFrame(list) shape:", df.shape)
            display(df.head())
        except Exception as e:
            print("pd.DataFrame(list) failed:", e)
        return

    # tipe lain
    print("Unhandled type; coba print repr pendek:")
    print(repr(obj)[:500])

# Jalankan satu-satu
inspect_annotation("Train")



===== Train =====
annotation dir: e:\tugas-akhir-qiqi\Dataset\Train\annotation
PKL found: ['annotation_training.pkl']
Using PKL: annotation_training.pkl | size: 793769
Loaded type: <class 'dict'>
Dict keys sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']
First value type: <class 'dict'>
pd.DataFrame(obj) shape: (6000, 6)


Unnamed: 0,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
J4GQm9j0JZ0.003.mp4,0.523364,0.552083,0.626374,0.601942,0.504673,0.488889
zEyRyTnIw5I.005.mp4,0.345794,0.375,0.472527,0.582524,0.457944,0.366667
nskJh7v6v1U.004.mp4,0.252336,0.291667,0.406593,0.485437,0.373832,0.511111
6wHQsN5g2RM.000.mp4,0.457944,0.489583,0.505495,0.398058,0.457944,0.377778
dQOeQYWIgm8.000.mp4,0.607477,0.489583,0.406593,0.621359,0.570093,0.622222


Index sample: ['J4GQm9j0JZ0.003.mp4', 'zEyRyTnIw5I.005.mp4', 'nskJh7v6v1U.004.mp4', '6wHQsN5g2RM.000.mp4', 'dQOeQYWIgm8.000.mp4', 'eHcRre1YsNA.000.mp4', 'vZpneJlniAE.005.mp4', 'oANKg9_grdA.004.mp4', 'VuadgOz6T7s.000.mp4', '7nhJXn9PI0I.001.mp4']
Columns sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']


In [5]:
inspect_annotation("Val")


===== Val =====
annotation dir: e:\tugas-akhir-qiqi\Dataset\Val\annotation
PKL found: ['annotation_validation.pkl']
Using PKL: annotation_validation.pkl | size: 261721
Loaded type: <class 'dict'>
Dict keys sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']
First value type: <class 'dict'>
pd.DataFrame(obj) shape: (2000, 6)


Unnamed: 0,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
modNfUPt3F4.002.mp4,0.64486,0.59375,0.615385,0.640777,0.616822,0.555556
h6LOjpCRXtY.005.mp4,0.439252,0.520833,0.417582,0.572816,0.439252,0.411111
WER4ww680QQ.004.mp4,0.457944,0.3125,0.428571,0.398058,0.373832,0.555556
c4XnKouozXU.002.mp4,0.364486,0.572917,0.527473,0.553398,0.523364,0.322222
OEKg-Tvwcbk.002.mp4,0.345794,0.46875,0.516484,0.417476,0.383178,0.477778


Index sample: ['modNfUPt3F4.002.mp4', 'h6LOjpCRXtY.005.mp4', 'WER4ww680QQ.004.mp4', 'c4XnKouozXU.002.mp4', 'OEKg-Tvwcbk.002.mp4', 'PtA7yAu9-VE.003.mp4', 'TmpP2fXeVtk.004.mp4', '1uC-2TZqplE.002.mp4', '_01AyUz9J9I.003.mp4', '_RfHkyf68Zs.000.mp4']
Columns sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']


In [6]:
inspect_annotation("Test")


===== Test =====
annotation dir: e:\tugas-akhir-qiqi\Dataset\Test\annotation
PKL found: ['annotation_test.pkl']
Using PKL: annotation_test.pkl | size: 261721
Loaded type: <class 'dict'>
Dict keys sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']
First value type: <class 'dict'>
pd.DataFrame(obj) shape: (2000, 6)


Unnamed: 0,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness
htH89DBizno.004.mp4,0.485981,0.645833,0.681319,0.669903,0.626168,0.822222
p_wf-KszNlk.001.mp4,0.616822,0.59375,0.692308,0.514563,0.570093,0.655556
MuYYY3XaJ7Q.001.mp4,0.46729,0.625,0.56044,0.524272,0.514019,0.522222
0MB91ku0eEw.005.mp4,0.411215,0.458333,0.714286,0.660194,0.570093,0.4
WpEZOSrENL0.003.mp4,0.317757,0.4375,0.384615,0.524272,0.448598,0.411111


Index sample: ['htH89DBizno.004.mp4', 'p_wf-KszNlk.001.mp4', 'MuYYY3XaJ7Q.001.mp4', '0MB91ku0eEw.005.mp4', 'WpEZOSrENL0.003.mp4', 'C2Y9Puk3Obk.004.mp4', 'ask-ZFRztf8.003.mp4', 'TSGpD2NBeCQ.005.mp4', '54JawR1x0II.004.mp4', '9n8dNi-ERQ0.001.mp4']
Columns sample: ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']


## **Cek Leakage**

In [8]:
from pathlib import Path

def group_id_from_clip_id(clip_id: str) -> str:
    # contoh: "--Ymqszjv54.001" -> "--Ymqszjv54"
    return str(clip_id).split(".")[0]

def get_clip_ids_from_audio(audio_dir: Path):
    return {p.stem for p in audio_dir.glob("*.wav")}

def pct(a, b):
    return (a / b * 100) if b else 0.0

def report_leakage():
    train_ids = get_clip_ids_from_audio(SPLITS["Train"]["audio_dir"])
    val_ids   = get_clip_ids_from_audio(SPLITS["Val"]["audio_dir"])
    test_ids  = get_clip_ids_from_audio(SPLITS["Test"]["audio_dir"])

    G_train = {group_id_from_clip_id(x) for x in train_ids}
    G_val   = {group_id_from_clip_id(x) for x in val_ids}
    G_test  = {group_id_from_clip_id(x) for x in test_ids}

    # overlaps
    ov_tv = G_train & G_val
    ov_tt = G_train & G_test
    ov_vt = G_val & G_test
    ov_any_test = (G_train | G_val) & G_test

    lines = []
    lines.append("=== Leakage Check (Group-level) ===")
    lines.append("Group ID definition: group_id = clip_id.split('.')[0]  (≈ YouTubeID)")
    lines.append("")
    lines.append("1) Official split sizes (by extracted audio):")
    lines.append(f"   - Train clips: {len(train_ids):,} | unique groups: {len(G_train):,}")
    lines.append(f"   - Val   clips: {len(val_ids):,} | unique groups: {len(G_val):,}")
    lines.append(f"   - Test  clips: {len(test_ids):,} | unique groups: {len(G_test):,}")
    lines.append("")
    lines.append("2) Group overlaps (potential identity leakage):")
    lines.append(f"   - Train ∩ Val : {len(ov_tv):,} groups ({pct(len(ov_tv), len(G_val)):.2f}% of Val groups)")
    lines.append(f"   - Train ∩ Test: {len(ov_tt):,} groups ({pct(len(ov_tt), len(G_test)):.2f}% of Test groups)")
    lines.append(f"   - Val   ∩ Test: {len(ov_vt):,} groups ({pct(len(ov_vt), len(G_test)):.2f}% of Test groups)")
    lines.append("")
    lines.append("3) Test leakage from Train/Val combined:")
    lines.append(f"   - (Train ∪ Val) ∩ Test: {len(ov_any_test):,} groups ({pct(len(ov_any_test), len(G_test)):.2f}% of Test groups)")
    lines.append("")

    # judgement
    if len(ov_any_test) == 0:
        verdict = "No group overlap detected between (Train ∪ Val) and Test (clean at group-level)."
    else:
        verdict = "Group overlap detected between (Train ∪ Val) and Test (identity leakage risk)."
    lines.append("4) Verdict:")
    lines.append(f"   - {verdict}")

    # sample overlaps
    def sample_list(s, n=10):
        return ", ".join(list(sorted(s))[:n]) if s else "-"

    lines.append("")
    lines.append("5) Sample overlapping group IDs:")
    lines.append(f"   - Train∩Test sample: {sample_list(ov_tt)}")
    lines.append(f"   - Val∩Test   sample: {sample_list(ov_vt)}")

    print("\n".join(lines))

report_leakage()


=== Leakage Check (Group-level) ===
Group ID definition: group_id = clip_id.split('.')[0]  (≈ YouTubeID)

1) Official split sizes (by extracted audio):
   - Train clips: 6,000 | unique groups: 2,624
   - Val   clips: 2,000 | unique groups: 1,484
   - Test  clips: 2,000 | unique groups: 1,455

2) Group overlaps (potential identity leakage):
   - Train ∩ Val : 1,222 groups (82.35% of Val groups)
   - Train ∩ Test: 1,201 groups (82.54% of Test groups)
   - Val   ∩ Test: 689 groups (47.35% of Test groups)

3) Test leakage from Train/Val combined:
   - (Train ∪ Val) ∩ Test: 1,281 groups (88.04% of Test groups)

4) Verdict:
   - Group overlap detected between (Train ∪ Val) and Test (identity leakage risk).

5) Sample overlapping group IDs:
   - Train∩Test sample: --Ymqszjv54, -6otZ7M-Mro, -8asrRvfJWA, -DOqN0d8KHw, -Gl98Jn45Fs, -N6QKrbnaDs, -NwfYYf5xLo, -R2SZu3SYgM, -VTqcHNgH7M, -Wqk9eex6bQ
   - Val∩Test   sample: -6otZ7M-Mro, -8asrRvfJWA, -DOqN0d8KHw, -N6QKrbnaDs, -PWjgx2czwY, -R2SZu3SYgM, -

## **Build Metadata**

Akan ada 4 metadata output (Dengan gabungan EnG, anotasi official, dan Age)
- meta_train_official.csv
- meta_val_official.csv
- meta_test_official.csv
- meta_master.csv (akan digunakan di split strict, yang mana ini gabungan dari train val dan test, dengan tambahan kolom source official (train, test, val)) 

In [7]:
OUT_DIR = ROOT / "output" / "preprocessing"
OUT_DIR.mkdir(parents=True, exist_ok=True)

def norm_clip(x: str) -> str:
    return Path(str(x)).stem  # "--Ymqszjv54.001.mp4" -> "--Ymqszjv54.001"

def group_id_from_clip_id(clip_id: str) -> str:
    return str(clip_id).split(".")[0]  # "--Ymqszjv54.001" -> "--Ymqszjv54"

In [13]:
# EnG Anotation (csv)
EnG_dev_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_dev.csv"
EnG_test_csv = DATASET / "Eth_gender_annotation" / "eth_gender_annotations_test.csv"

# Age Anotation (csv)
Age_dev_csv = DATASET / "Age" / "age_anno_dev.csv"
Age_test_csv = DATASET / "Age" / "age_anno_test.csv"

EnG_dev_df = pd.read_csv(EnG_dev_csv, sep=';')
EnG_test_df = pd.read_csv(EnG_test_csv, sep=';')
Age_dev_df = pd.read_csv(Age_dev_csv)
Age_test_df = pd.read_csv(Age_test_csv)

print("EnG_dev:", EnG_dev_df.shape, "| EnG_test:", EnG_test_df.shape)
print("Age_dev:", Age_dev_df.shape, "| Age_test:", Age_test_df.shape)


EnG_dev: (8000, 4) | EnG_test: (2000, 4)
Age_dev: (8000, 3) | Age_test: (2000, 3)


In [14]:
for df in [EnG_dev_df, EnG_test_df, Age_dev_df, Age_test_df]:
    df["clip_id"] = df["VideoName"].map(norm_clip)

meta_dev = EnG_dev_df.merge(Age_dev_df[["clip_id", "AgeGroup"]], on="clip_id", how="left")
meta_test = EnG_test_df.merge(Age_test_df[["clip_id", "AgeGroup"]], on="clip_id", how="left")

# tambah group_id
meta_dev["group_id"] = meta_dev["clip_id"].map(group_id_from_clip_id)
meta_test["group_id"] = meta_test["clip_id"].map(group_id_from_clip_id)

print("meta_dev:", meta_dev.shape)
print("meta_test:", meta_test.shape)
display(meta_dev.head())


meta_dev: (8000, 7)
meta_test: (2000, 7)


Unnamed: 0,VideoName,YouTubeID,Ethnicity,Gender,clip_id,AgeGroup,group_id
0,--Ymqszjv54.001.mp4,--Ymqszjv54,2,1,--Ymqszjv54.001,5,--Ymqszjv54
1,--Ymqszjv54.003.mp4,--Ymqszjv54,2,1,--Ymqszjv54.003,5,--Ymqszjv54
2,--Ymqszjv54.004.mp4,--Ymqszjv54,2,1,--Ymqszjv54.004,5,--Ymqszjv54
3,--Ymqszjv54.005.mp4,--Ymqszjv54,2,1,--Ymqszjv54.005,5,--Ymqszjv54
4,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,2,1,-2qsCrkXdWs.001,2,-2qsCrkXdWs


In [15]:
def load_pkl(path: Path):
    with open(path, "rb") as f:
        try:
            return pickle.load(f)
        except UnicodeDecodeError:
            f.seek(0)
            return pickle.load(f, encoding="latin1")

def load_bigfive_from_split(split_name: str) -> pd.DataFrame:
    ann_dir = SPLITS[split_name]["split_root"] / "annotation"
    pkls = sorted(list(ann_dir.glob("*.pkl")) + list(ann_dir.glob("*.pickle")))
    if not pkls:
        raise FileNotFoundError(f"Tidak ada PKL di: {ann_dir}")

    pkl_path = max(pkls, key=lambda p: p.stat().st_size)
    obj = load_pkl(pkl_path)

    if not isinstance(obj, dict):
        raise ValueError(f"Format PKL bukan dict: {type(obj)} (file: {pkl_path.name})")

    # dict-of-dict (trait -> {clip: value}) -> DataFrame: rows=clip, cols=trait
    df = pd.DataFrame(obj)

    # index biasanya nama file clip (kadang ada .mp4)
    df = df.reset_index().rename(columns={"index": "clip_id"})
    df["clip_id"] = df["clip_id"].astype(str).map(norm_clip)

    # keep 5 trait utama (+ interview kalau ada)
    keep = ["clip_id", "extraversion", "neuroticism", "agreeableness", "conscientiousness", "openness"]
    if "interview" in df.columns:
        keep.append("interview")
    for c in keep:
        if c not in df.columns:
            df[c] = np.nan

    # numeric
    for c in ["extraversion","neuroticism","agreeableness","conscientiousness","openness","interview"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    df["group_id"] = df["clip_id"].map(group_id_from_clip_id)
    df.attrs["source"] = str(pkl_path)
    return df[keep + ["group_id"]]

big_train = load_bigfive_from_split("Train")
big_val   = load_bigfive_from_split("Val")
big_test  = load_bigfive_from_split("Test")

print("BigFive Train:", big_train.shape, "| src:", big_train.attrs.get("source"))
print("BigFive Val  :", big_val.shape,   "| src:", big_val.attrs.get("source"))
print("BigFive Test :", big_test.shape,  "| src:", big_test.attrs.get("source"))
display(big_train.head())


BigFive Train: (6000, 8) | src: e:\tugas-akhir-qiqi\Dataset\Train\annotation\annotation_training.pkl
BigFive Val  : (2000, 8) | src: e:\tugas-akhir-qiqi\Dataset\Val\annotation\annotation_validation.pkl
BigFive Test : (2000, 8) | src: e:\tugas-akhir-qiqi\Dataset\Test\annotation\annotation_test.pkl


Unnamed: 0,clip_id,extraversion,neuroticism,agreeableness,conscientiousness,openness,interview,group_id
0,J4GQm9j0JZ0.003,0.523364,0.552083,0.626374,0.601942,0.488889,0.504673,J4GQm9j0JZ0
1,zEyRyTnIw5I.005,0.345794,0.375,0.472527,0.582524,0.366667,0.457944,zEyRyTnIw5I
2,nskJh7v6v1U.004,0.252336,0.291667,0.406593,0.485437,0.511111,0.373832,nskJh7v6v1U
3,6wHQsN5g2RM.000,0.457944,0.489583,0.505495,0.398058,0.377778,0.457944,6wHQsN5g2RM
4,dQOeQYWIgm8.000,0.607477,0.489583,0.406593,0.621359,0.622222,0.570093,dQOeQYWIgm8


In [16]:
def audio_clip_ids(audio_dir: Path) -> set[str]:
    return {p.stem for p in audio_dir.glob("*.wav")}

train_ids = audio_clip_ids(SPLITS["Train"]["audio_dir"])
val_ids   = audio_clip_ids(SPLITS["Val"]["audio_dir"])
test_ids  = audio_clip_ids(SPLITS["Test"]["audio_dir"])

print("audio counts:", len(train_ids), len(val_ids), len(test_ids))


audio counts: 6000 2000 2000


In [22]:
def build_official_meta(split_name: str) -> pd.DataFrame:
    if split_name == "Train":
        ids = train_ids
        demo = meta_dev
        big = big_train
        split_off = "train"
    elif split_name == "Val":
        ids = val_ids
        demo = meta_dev
        big = big_val
        split_off = "val"
    else:
        ids = test_ids
        demo = meta_test
        big = big_test
        split_off = "test"

    demo_s = demo[demo["clip_id"].isin(ids)].copy()
    big_s  = big[big["clip_id"].isin(ids)].copy()

    # merge demo + bigfive
    out = demo_s.merge(big_s.drop(columns=["group_id"]), on="clip_id", how="left", suffixes=("", "_bf"))
    out["split_official"] = split_off
    out["group_id"] = out["clip_id"].map(group_id_from_clip_id)

    # rapihin kolom (optional)
    prefer_cols = [
        "group_id","clip_id","split_official",
        "VideoName","Ethnicity","Gender","AgeGroup",
        "extraversion","neuroticism","agreeableness","conscientiousness","openness"
    ]
    if "interview" in out.columns:
        prefer_cols.append("interview")

    cols = [c for c in prefer_cols if c in out.columns] + [c for c in out.columns if c not in prefer_cols]
    out = out[cols]
    out = out.drop(columns=["YouTubeID"], errors="ignore")
    return out

meta_train_off = build_official_meta("Train")
meta_val_off   = build_official_meta("Val")
meta_test_off  = build_official_meta("Test")

# meta_master = gabungan train+val+test (10k) untuk strict split
meta_master = pd.concat([meta_train_off, meta_val_off, meta_test_off], ignore_index=True)

# sanity checks
print("meta_train_off:", meta_train_off.shape)
print("meta_val_off  :", meta_val_off.shape)
print("meta_test_off :", meta_test_off.shape)
print("meta_master   :", meta_master.shape)

# pastikan unique clip_id
dup = meta_master["clip_id"].duplicated().sum()
print("duplicated clip_id in meta_master:", dup)

# save
(meta_train_off).to_csv(OUT_DIR / "meta_train_official.csv", index=False)
(meta_val_off).to_csv(OUT_DIR / "meta_val_official.csv", index=False)
(meta_test_off).to_csv(OUT_DIR / "meta_test_official.csv", index=False)
(meta_master).to_csv(OUT_DIR / "meta_master.csv", index=False)

print("\nSaved to:", OUT_DIR.resolve())


meta_train_off: (6000, 13)
meta_val_off  : (2000, 13)
meta_test_off : (2000, 13)
meta_master   : (10000, 13)
duplicated clip_id in meta_master: 0

Saved to: E:\tugas-akhir-qiqi\output\preprocessing


# **PART 2 - Preprocess Audio**

In [23]:
OUT_DIR = ROOT / "output" / "preprocessing"

df = pd.read_csv(OUT_DIR / "meta_master.csv")
df["split_official"] = df["split_official"].astype(str).str.lower()

AUDIO_DIR_MAP = {
    "train": SPLITS["Train"]["audio_dir"],
    "val":   SPLITS["Val"]["audio_dir"],
    "test":  SPLITS["Test"]["audio_dir"],
}

df["audio_path"] = df.apply(lambda r: AUDIO_DIR_MAP[r["split_official"]] / f"{r['clip_id']}.wav", axis=1)

print("Loaded:", df.shape)
print(df["split_official"].value_counts(dropna=False))
df[["clip_id","split_official","audio_path"]].head()


Loaded: (10000, 14)
split_official
train    6000
val      2000
test     2000
Name: count, dtype: int64


Unnamed: 0,clip_id,split_official,audio_path
0,--Ymqszjv54.001,train,e:\tugas-akhir-qiqi\Dataset\Train\train\audio\...
1,--Ymqszjv54.003,train,e:\tugas-akhir-qiqi\Dataset\Train\train\audio\...
2,--Ymqszjv54.004,train,e:\tugas-akhir-qiqi\Dataset\Train\train\audio\...
3,--Ymqszjv54.005,train,e:\tugas-akhir-qiqi\Dataset\Train\train\audio\...
4,-2qsCrkXdWs.001,train,e:\tugas-akhir-qiqi\Dataset\Train\train\audio\...


## **CEK SAMPLE RATE, Channel, dan Durasi (hanya 5 sampel)**

In [24]:
paths = df["audio_path"].tolist()

exists = []
for p in tqdm(paths, desc="Checking audio_path exists"):
    exists.append(Path(p).exists())

df["audio_exists"] = exists

print(df["audio_exists"].value_counts())

miss = df.loc[~df["audio_exists"], ["clip_id","split_official"]].head(10)
print("missing (first 10):")
display(miss)

ok = df.loc[df["audio_exists"]].head(5)
for _, r in ok.iterrows():
    ap = Path(r["audio_path"])
    with sf.SoundFile(ap) as f:
        dur = len(f) / f.samplerate
        print(r["clip_id"], "|", f.samplerate, "Hz |", f.channels, "ch |", round(dur, 2), "s")


Checking audio_path exists: 100%|██████████| 10000/10000 [00:01<00:00, 6988.09it/s]

audio_exists
True    10000
Name: count, dtype: int64
missing (first 10):





Unnamed: 0,clip_id,split_official


--Ymqszjv54.001 | 16000 Hz | 1 ch | 15.3 s
--Ymqszjv54.003 | 16000 Hz | 1 ch | 15.3 s
--Ymqszjv54.004 | 16000 Hz | 1 ch | 15.3 s
--Ymqszjv54.005 | 16000 Hz | 1 ch | 15.3 s
-2qsCrkXdWs.001 | 16000 Hz | 1 ch | 15.3 s


In [26]:
df = df.drop(columns=["audio_exists", "audio_path"], errors="ignore")
df.head()


Unnamed: 0,group_id,clip_id,split_official,VideoName,Ethnicity,Gender,AgeGroup,extraversion,neuroticism,agreeableness,conscientiousness,openness,interview
0,--Ymqszjv54,--Ymqszjv54.001,train,--Ymqszjv54.001.mp4,2,1,5,0.551402,0.5,0.527473,0.650485,0.744444,0.588785
1,--Ymqszjv54,--Ymqszjv54.003,train,--Ymqszjv54.003.mp4,2,1,5,0.392523,0.427083,0.516484,0.475728,0.466667,0.392523
2,--Ymqszjv54,--Ymqszjv54.004,train,--Ymqszjv54.004.mp4,2,1,5,0.317757,0.322917,0.549451,0.368932,0.544444,0.401869
3,--Ymqszjv54,--Ymqszjv54.005,train,--Ymqszjv54.005.mp4,2,1,5,0.299065,0.291667,0.373626,0.320388,0.344444,0.280374
4,-2qsCrkXdWs,-2qsCrkXdWs.001,train,-2qsCrkXdWs.001.mp4,2,1,2,0.476636,0.604167,0.593407,0.572816,0.611111,0.560748


## **CEK TOTAL DURASI DAN STATISTIK DURASI**

In [28]:
# EDA ringkas: sr, channels, duration (scan semua audio) + summary

# mapping split_official -> audio_dir
AUDIO_DIR_MAP = {
    "train": SPLITS["Train"]["audio_dir"],
    "val":   SPLITS["Val"]["audio_dir"],
    "test":  SPLITS["Test"]["audio_dir"],
}

sr_list, ch_list, dur_list, ok_list, err_list = [], [], [], [], []

for _, r in tqdm(df.iterrows(), total=len(df), desc="Scanning audio metadata"):
    split = str(r["split_official"]).lower()
    clip_id = str(r["clip_id"])
    ap = AUDIO_DIR_MAP[split] / f"{clip_id}.wav"

    try:
        with sf.SoundFile(ap) as f:
            sr = int(f.samplerate)
            ch = int(f.channels)
            frames = int(len(f))
            dur = frames / sr if sr > 0 else 0.0
        sr_list.append(sr); ch_list.append(ch); dur_list.append(dur)
        ok_list.append(True); err_list.append("")
    except Exception as e:
        sr_list.append(np.nan); ch_list.append(np.nan); dur_list.append(np.nan)
        ok_list.append(False); err_list.append(str(e)[:120])

df_eda = df.copy()
df_eda["decode_ok"] = ok_list
df_eda["sr"] = sr_list
df_eda["channels"] = ch_list
df_eda["duration_sec"] = dur_list
df_eda["decode_err"] = err_list

print("Decode OK:", int(df_eda["decode_ok"].sum()), "/", len(df_eda))
print("\nSample rate counts:")
print(df_eda.loc[df_eda["decode_ok"], "sr"].value_counts().head(10))

print("\nChannel counts:")
print(df_eda.loc[df_eda["decode_ok"], "channels"].value_counts())

dur_ok = df_eda.loc[df_eda["decode_ok"], "duration_sec"]
print("\nDuration summary (sec):")
print(dur_ok.describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]).round(3))

print("\n< 1.0s count:", int((dur_ok < 1.0).sum()))
print("< 2.0s count:", int((dur_ok < 2.0).sum()))
print("< 15.0s count:", int((dur_ok < 15.0).sum()))
print("> 15.0s count:", int((dur_ok > 15.0).sum()))


Scanning audio metadata: 100%|██████████| 10000/10000 [01:55<00:00, 86.31it/s]

Decode OK: 10000 / 10000

Sample rate counts:
sr
16000    10000
Name: count, dtype: int64

Channel counts:
channels
1    10000
Name: count, dtype: int64

Duration summary (sec):
count    10000.000
mean        15.271
std          0.471
min          2.067
1%          15.302
5%          15.302
50%         15.302
95%         15.302
99%         15.302
max         15.302
Name: duration_sec, dtype: float64

< 1.0s count: 0
< 2.0s count: 0
< 15.0s count: 73
> 15.0s count: 9927





## **TRIM & PAD 15 Second**

In [30]:
# Trim/pad semua audio jadi tepat 15.0 detik (16k mono) + progress bar

TARGET_SR = 16000
TARGET_SEC = 15.0
TARGET_LEN = int(TARGET_SR * TARGET_SEC)

OUT_DIR = Path("output/preprocessing")
OUT_WAV_DIR = OUT_DIR / "preprocessed_full"
OUT_WAV_DIR.mkdir(parents=True, exist_ok=True)

AUDIO_DIR_MAP = {
    "train": SPLITS["Train"]["audio_dir"],
    "val":   SPLITS["Val"]["audio_dir"],
    "test":  SPLITS["Test"]["audio_dir"],
}

rows = []

for _, r in tqdm(df.iterrows(), total=len(df), desc="Trim/Pad to 15s"):
    split = str(r["split_official"]).lower()
    clip_id = str(r["clip_id"])
    in_path = AUDIO_DIR_MAP[split] / f"{clip_id}.wav"
    out_path = OUT_WAV_DIR / f"{clip_id}.wav"

    x, sr = sf.read(in_path, always_2d=False)

    # safety checks (harusnya sudah 16k mono, tapi tetap aman)
    if isinstance(x, np.ndarray) and x.ndim > 1:
        x = x.mean(axis=1)
    if sr != TARGET_SR:
        raise ValueError(f"Unexpected sr={sr} for {clip_id}. Expected {TARGET_SR}.")

    orig_len = len(x)
    orig_dur = orig_len / sr if sr else 0.0

    flag_short = int(orig_len < TARGET_LEN)

    if orig_len > TARGET_LEN:
        x15 = x[:TARGET_LEN]
    elif orig_len < TARGET_LEN:
        x15 = np.pad(x, (0, TARGET_LEN - orig_len), mode="constant")
    else:
        x15 = x

    sf.write(out_path, x15, TARGET_SR, subtype="PCM_16")

    rows.append({
        "clip_id": clip_id,
        "split_official": split,
        "audio_in": str(in_path),
        "audio_out": str(out_path),
        "orig_duration_sec": orig_dur,
        "flag_short": flag_short
    })

trim_manifest = pd.DataFrame(rows)
trim_manifest.to_csv(OUT_DIR / "trim_pad_manifest.csv", index=False)

print("Saved:", OUT_WAV_DIR)
print("Saved:", OUT_DIR / "trim_pad_manifest.csv")
print("flag_short counts:\n", trim_manifest["flag_short"].value_counts())


Trim/Pad to 15s: 100%|██████████| 10000/10000 [02:06<00:00, 78.81it/s]

Saved: output\preprocessing\preprocessed_full
Saved: output\preprocessing\trim_pad_manifest.csv
flag_short counts:
 flag_short
0    9927
1      73
Name: count, dtype: int64





In [8]:
import sys, site, pkgutil
print("Python exe:", sys.executable)
print("Python ver:", sys.version)
print("User site :", site.getusersitepackages())
print("torch exists in env?:", any(m.name=="torch" for m in pkgutil.iter_modules()))


Python exe: c:\Users\aquq1\AppData\Local\Programs\Python\Python312\python.exe
Python ver: 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)]
User site : C:\Users\aquq1\AppData\Roaming\Python\Python312\site-packages
torch exists in env?: True


## **SILERO VAD PART 1**

In [8]:
# Silero VAD check: hitung speech_sec, voiced_ratio, n_seg untuk semua audio 15s

OUT_DIR = ROOT / "output" / "preprocessing"
IN_MANIFEST = OUT_DIR / "trim_pad_manifest.csv"
VAD_REPORT = OUT_DIR / "vad" / "vad_report.csv"
VAD_DROP = OUT_DIR / "vad" / "vad_drop.csv"
VAD_DROP.parent.mkdir(parents=True, exist_ok=True)
VAD_REPORT.parent.mkdir(parents=True, exist_ok=True)


SR = 16000
FIXED_SEC = 15.0
MIN_SPEECH_SEC = 2.0  # nanti dipakai buat drop list

# load manifest hasil trim/pad
trim_manifest = pd.read_csv(IN_MANIFEST)
print("trim_manifest:", trim_manifest.shape)

# load silero vad
try:
    import torch
except Exception as e:
    raise RuntimeError("torch belum terinstall. Install dulu: pip install torch") from e

torch.set_grad_enabled(False)

model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=False
)
(get_speech_timestamps, _, _, _, _) = utils
model.eval()

rows = []
for _, r in tqdm(trim_manifest.iterrows(), total=len(trim_manifest), desc="Silero VAD"):
    clip_id = str(r["clip_id"])
    ap = ROOT / Path(r["audio_out"])  # sudah 16k mono 15s

    x, sr = sf.read(ap, always_2d=False)
    if sr != SR:
        raise ValueError(f"sr mismatch for {clip_id}: {sr}")

    if isinstance(x, np.ndarray) and x.ndim > 1:
        x = x.mean(axis=1)
    x = x.astype(np.float32)

    wav = torch.from_numpy(x)
    ts = get_speech_timestamps(wav, model, sampling_rate=SR)

    speech_sec = sum((t["end"] - t["start"]) for t in ts) / SR
    n_seg = len(ts)
    voiced_ratio = speech_sec / FIXED_SEC

    rows.append({
        "clip_id": clip_id,
        "speech_sec": float(speech_sec),
        "voiced_ratio": float(voiced_ratio),
        "n_seg": int(n_seg),
    })

vad_df = pd.DataFrame(rows)
vad_df.to_csv(VAD_REPORT, index=False)

drop_df = vad_df[vad_df["speech_sec"] < MIN_SPEECH_SEC].copy()
drop_df["reason"] = "too_little_speech"
drop_df.to_csv(VAD_DROP, index=False)

print("Saved:", VAD_REPORT)
print("Saved:", VAD_DROP)
print("Drop count (speech_sec < 2s):", len(drop_df))
print("\nSpeech_sec summary:")
print(vad_df["speech_sec"].describe(percentiles=[0.01,0.05,0.5,0.95,0.99]).round(3))


trim_manifest: (10000, 6)


Using cache found in C:\Users\aquq1/.cache\torch\hub\snakers4_silero-vad_master
Silero VAD: 100%|██████████| 10000/10000 [43:53<00:00,  3.80it/s] 


Saved: e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_report.csv
Saved: e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_drop.csv
Drop count (speech_sec < 2s): 42

Speech_sec summary:
count    10000.000
mean        13.103
std          1.906
min          0.000
1%           5.721
5%           9.663
50%         13.612
95%         14.934
99%         15.000
max         15.000
Name: speech_sec, dtype: float64


## **CEK ULANG VAD DROP**

karena ditemukan beberapa anomali setelah di crosscheck seperti ada audio yang belum ter-ekstrak dengan baik ataupun bug ekstraksi, dan juga ada beberapa audio dengan case khusus seperti suara bisik bisik yang butuh tuning VAD yang lebih baik


In [14]:
import imageio_ffmpeg
import subprocess
from pathlib import Path
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm

ROOT = Path.cwd().parent
DATASET = ROOT / "Dataset"
OUT_DIR = ROOT / "output" / "preprocessing"

VAD_DROP_PATH = OUT_DIR / "vad" / "vad_drop.csv"
drop_df = pd.read_csv(VAD_DROP_PATH)
drop_ids = drop_df["clip_id"].astype(str).unique().tolist()
print("Drop clips:", len(drop_ids))

meta = pd.read_csv(OUT_DIR / "meta_master.csv")
meta["split_official"] = meta["split_official"].astype(str).str.lower()

VIDEO_DIR_MAP = {
    "train": DATASET / "Train" / "train",
    "val":   DATASET / "Val" / "val",
    "test":  DATASET / "Test" / "test",
}

RAW_REEX_DIR = OUT_DIR / "reextract_raw"
FIXED15_DIR  = OUT_DIR / "preprocessed_full_retry"
RAW_REEX_DIR.mkdir(parents=True, exist_ok=True)
FIXED15_DIR.mkdir(parents=True, exist_ok=True)

ffmpeg = imageio_ffmpeg.get_ffmpeg_exe()

TARGET_SR = 16000
TARGET_SEC = 15.0
TARGET_LEN = int(TARGET_SR * TARGET_SEC)

PEAK_SILENT_THR = 1e-4
RMS_SILENT_THR  = 1e-5

STREAM_CANDIDATES = ["0:a:0", "0:a:1", "0:a:2", "0:a:3"]

def rms(x: np.ndarray) -> float:
    x = np.asarray(x)
    if x.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(x.astype(np.float64)))))

def peak(x: np.ndarray) -> float:
    x = np.asarray(x)
    if x.size == 0:
        return 0.0
    return float(np.max(np.abs(x)))

def best_mono_from_stereo(x: np.ndarray):
    """
    x: (n,2) stereo atau (n,) mono.
    Return: mono_signal, chosen_mode (L/R/mean)
    """
    if x.ndim == 1:
        return x, "mono"

    L = x[:, 0]
    R = x[:, 1]
    M = 0.5 * (L + R)  # mean (bisa cancel kalau beda fase)

    # pilih yang RMS paling besar (anti cancellation)
    cands = [("L", L), ("R", R), ("mean", M)]
    best_mode, best_sig = max(cands, key=lambda t: rms(t[1]))
    return best_sig, best_mode

rows = []
fails = []

for clip_id in tqdm(drop_ids, desc="Re-extract (anti-phase) + fix 15s"):
    row = meta.loc[meta["clip_id"].astype(str) == clip_id]
    if row.empty:
        fails.append({"clip_id": clip_id, "reason": "clip_not_in_meta"})
        continue

    split = row.iloc[0]["split_official"]
    video_path = VIDEO_DIR_MAP[split] / f"{clip_id}.mp4"
    if not video_path.exists():
        fails.append({"clip_id": clip_id, "reason": "video_missing", "video_path": str(video_path)})
        continue

    out_wav = FIXED15_DIR / f"{clip_id}.wav"

    best = None
    # best = (score_rms, best_raw_path, stream, chosen_mode, pk, rms)

    for stream in STREAM_CANDIDATES:
        raw_wav = RAW_REEX_DIR / f"{clip_id}__{stream.replace(':','_')}.wav"

        # Extract as STEREO (ac=2) to avoid cancellation
        cmd = [
            ffmpeg, "-y",
            "-i", str(video_path),
            "-map", stream,
            "-vn",
            "-ac", "2",
            "-ar", str(TARGET_SR),
            "-acodec", "pcm_s16le",
            str(raw_wav)
        ]
        r = subprocess.run(cmd, capture_output=True, text=True)

        if r.returncode != 0 or (not raw_wav.exists()):
            continue

        try:
            x, sr = sf.read(raw_wav, always_2d=True)  # always_2d biar (n, ch)
            if sr != TARGET_SR:
                continue

            mono, mode = best_mono_from_stereo(x)
            pk = peak(mono)
            rr = rms(mono)

            # silent guard
            if pk < PEAK_SILENT_THR or rr < RMS_SILENT_THR:
                continue

            # score pakai RMS (pilih yang paling "kenceng" & berisi)
            score = rr
            if (best is None) or (score > best[0]):
                best = (score, raw_wav, stream, mode, pk, rr)

        except Exception:
            continue

    if best is None:
        fails.append({"clip_id": clip_id, "reason": "no_valid_audio_stream_or_silent", "video_path": str(video_path)})
        continue

    _, best_raw, best_stream, best_mode, best_pk, best_rms = best

    # Read lagi best_raw dan bikin mono final sesuai mode
    x, sr = sf.read(best_raw, always_2d=True)
    mono, _ = best_mono_from_stereo(x)

    # OPTIONAL: peak normalize ringan biar bisik kebantu (nggak mengubah konten)
    pk = peak(mono) + 1e-9
    mono = (mono / pk) * 0.98

    orig_len = len(mono)
    flag_short = int(orig_len < TARGET_LEN)

    if orig_len > TARGET_LEN:
        x15 = mono[:TARGET_LEN]
    elif orig_len < TARGET_LEN:
        x15 = np.pad(mono, (0, TARGET_LEN - orig_len), mode="constant")
    else:
        x15 = mono

    sf.write(out_wav, x15.astype(np.float32), TARGET_SR, subtype="PCM_16")

    rows.append({
        "clip_id": clip_id,
        "split_official": split,
        "video_path": str(video_path),
        "best_stream": best_stream,
        "mono_mode": best_mode,          # L / R / mean / mono
        "raw_wav": str(best_raw),
        "audio_out": str(out_wav),
        "orig_duration_sec": orig_len / TARGET_SR,
        "flag_short": flag_short,
        "best_rms": best_rms,
        "best_peak": best_pk
    })

retry_manifest = pd.DataFrame(rows)
retry_fails = pd.DataFrame(fails)

retry_manifest.to_csv(OUT_DIR / "trim_pad_manifest_retry.csv", index=False)
retry_fails.to_csv(OUT_DIR / "reextract_failed.csv", index=False)

print("Saved:", OUT_DIR / "trim_pad_manifest_retry.csv")
print("Saved:", OUT_DIR / "reextract_failed.csv")
print("ok:", len(retry_manifest), "| failed:", len(retry_fails))


Drop clips: 42


Re-extract (anti-phase) + fix 15s: 100%|██████████| 42/42 [00:09<00:00,  4.47it/s]

Saved: e:\tugas-akhir-qiqi\output\preprocessing\trim_pad_manifest_retry.csv
Saved: e:\tugas-akhir-qiqi\output\preprocessing\reextract_failed.csv
ok: 42 | failed: 0





In [15]:
OUT_DIR = ROOT / Path("output/preprocessing")
man = pd.read_csv(OUT_DIR / "trim_pad_manifest_retry.csv")
print("retry files:", man.shape)

VAD_DIR = OUT_DIR / "vad"
VAD_DIR.mkdir(parents=True, exist_ok=True)

# load silero
torch.set_grad_enabled(False)
model, utils = torch.hub.load("snakers4/silero-vad", "silero_vad", force_reload=False)
(get_speech_timestamps, _, _, _, _) = utils
model.eval()

SR = 16000
FIXED_SEC = 15.0
MIN_SPEECH_SEC = 2.0

# tuning: lebih sensitif untuk bisik-bisik
VAD_KW = dict(
    threshold=0.35,              # default biasanya 0.5
    min_speech_duration_ms=100,  # lebih kecil
    min_silence_duration_ms=50,  # lebih kecil
    speech_pad_ms=30
)

rows = []
for _, r in tqdm(man.iterrows(), total=len(man), desc="Silero VAD (tuned)"):
    clip_id = str(r["clip_id"])
    ap = Path(r["audio_out"])

    x, sr = sf.read(ap, always_2d=False)
    if isinstance(x, np.ndarray) and x.ndim > 1:
        x = x.mean(axis=1)
    if sr != SR:
        raise ValueError(f"sr mismatch {clip_id}: {sr}")

    x = x.astype(np.float32)

    # optional: peak normalize biar bisik lebih kebaca
    peak = float(np.max(np.abs(x)) + 1e-9)
    x = (x / peak) * 0.98

    ts = get_speech_timestamps(torch.from_numpy(x), model, sampling_rate=SR, **VAD_KW)
    speech_sec = sum((t["end"] - t["start"]) for t in ts) / SR
    n_seg = len(ts)
    voiced_ratio = speech_sec / FIXED_SEC

    rows.append({
        "clip_id": clip_id,
        "speech_sec": float(speech_sec),
        "voiced_ratio": float(voiced_ratio),
        "n_seg": int(n_seg),
    })

vad_retry = pd.DataFrame(rows)
vad_retry.to_csv(VAD_DIR / "vad_report_retry.csv", index=False)

drop_retry = vad_retry[vad_retry["speech_sec"] < MIN_SPEECH_SEC].copy()
drop_retry["reason"] = "too_little_speech"
drop_retry.to_csv(VAD_DIR / "vad_drop_retry.csv", index=False)

print("Saved:", VAD_DIR / "vad_report_retry.csv")
print("Saved:", VAD_DIR / "vad_drop_retry.csv")
print("Still drop (<2s):", len(drop_retry))


retry files: (42, 11)


Using cache found in C:\Users\aquq1/.cache\torch\hub\snakers4_silero-vad_master
Silero VAD (tuned): 100%|██████████| 42/42 [00:10<00:00,  4.08it/s]

Saved: e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_report_retry.csv
Saved: e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_drop_retry.csv
Still drop (<2s): 26





In [16]:
from pathlib import Path
import shutil
import pandas as pd

BASE = ROOT / Path("output/preprocessing")

FULL_DIR = BASE / "preprocessed_full"
RETRY_DIR = BASE / "preprocessed_full_retry"
RAW_DIR = BASE / "reextract_raw"

MANIFEST = BASE / "trim_pad_manifest.csv"
MANIFEST_RETRY = BASE / "trim_pad_manifest_retry.csv"

VAD_DIR = BASE / "vad"
VAD_REPORT = VAD_DIR / "vad_report.csv"
VAD_REPORT_RETRY = VAD_DIR / "vad_report_retry.csv"
VAD_DROP = VAD_DIR / "vad_drop.csv"
VAD_DROP_RETRY = VAD_DIR / "vad_drop_retry.csv"

REEX_FAILED = BASE / "reextract_failed.csv"

# 1) overwrite audio files
if RETRY_DIR.exists():
    retry_wavs = list(RETRY_DIR.glob("*.wav"))
    print("Retry wavs:", len(retry_wavs))
    for src in retry_wavs:
        dst = FULL_DIR / src.name
        shutil.copy2(src, dst)  # overwrite
else:
    print("No retry dir:", RETRY_DIR)

# helper: patch csv by clip_id
def patch_by_clip_id(base_csv: Path, patch_csv: Path, out_csv: Path):
    base_df = pd.read_csv(base_csv)
    patch_df = pd.read_csv(patch_csv)

    base_df["clip_id"] = base_df["clip_id"].astype(str)
    patch_df["clip_id"] = patch_df["clip_id"].astype(str)

    # remove base rows that will be replaced, then append patch, then sort (optional)
    base_df = base_df[~base_df["clip_id"].isin(set(patch_df["clip_id"]))].copy()
    merged = pd.concat([base_df, patch_df], ignore_index=True)

    merged.to_csv(out_csv, index=False)
    return len(patch_df)

# 2) patch trim_pad_manifest.csv
if MANIFEST.exists() and MANIFEST_RETRY.exists():
    n = patch_by_clip_id(MANIFEST, MANIFEST_RETRY, MANIFEST)
    print("Patched trim_pad_manifest rows:", n)

# 3) patch vad_report.csv
if VAD_REPORT.exists() and VAD_REPORT_RETRY.exists():
    n = patch_by_clip_id(VAD_REPORT, VAD_REPORT_RETRY, VAD_REPORT)
    print("Patched vad_report rows:", n)

# 4) vad_drop.csv = vad_drop_retry.csv
if VAD_DROP_RETRY.exists():
    shutil.copy2(VAD_DROP_RETRY, VAD_DROP)
    print("Replaced vad_drop.csv with vad_drop_retry.csv")

# 5) delete reextract_failed.csv if empty
if REEX_FAILED.exists():
    try:
        df_fail = pd.read_csv(REEX_FAILED)
        if len(df_fail) == 0:
            REEX_FAILED.unlink()
            print("Deleted empty reextract_failed.csv")
    except Exception:
        pass

# 6) cleanup temp dirs/files (optional but recommended)
#   delete retry dir and raw dir, and retry csv files (optional)
if RETRY_DIR.exists():
    shutil.rmtree(RETRY_DIR)
    print("Deleted dir:", RETRY_DIR)

if RAW_DIR.exists():
    shutil.rmtree(RAW_DIR)
    print("Deleted dir:", RAW_DIR)

# remove retry csvs (optional)
for p in [MANIFEST_RETRY, VAD_REPORT_RETRY, VAD_DROP_RETRY]:
    if p.exists():
        p.unlink()
        print("Deleted file:", p)

print("\nDone. Current key files:")
print("-", FULL_DIR)
print("-", MANIFEST)
print("-", VAD_REPORT)
print("-", VAD_DROP)


Retry wavs: 42
Patched trim_pad_manifest rows: 42
Patched vad_report rows: 42
Replaced vad_drop.csv with vad_drop_retry.csv
Deleted dir: e:\tugas-akhir-qiqi\output\preprocessing\preprocessed_full_retry
Deleted dir: e:\tugas-akhir-qiqi\output\preprocessing\reextract_raw
Deleted file: e:\tugas-akhir-qiqi\output\preprocessing\trim_pad_manifest_retry.csv
Deleted file: e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_report_retry.csv
Deleted file: e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_drop_retry.csv

Done. Current key files:
- e:\tugas-akhir-qiqi\output\preprocessing\preprocessed_full
- e:\tugas-akhir-qiqi\output\preprocessing\trim_pad_manifest.csv
- e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_report.csv
- e:\tugas-akhir-qiqi\output\preprocessing\vad\vad_drop.csv
