In [13]:
# import dan setup
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

# path penting
CSV_PATH = r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\training\multimodal_splits\audio_only_dataset.csv"
AUDIO_BASE_DIR = r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection"

MODEL2_PATH = r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\audio_baseline\best_audio_cnn.pt"
MODEL4_HEAD_PATH = r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\audio_baseline\best_audio_wav2vec2.pt"
EMB_PATH = r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection\src\modeling\unimodal\audio\audio_embeddings_precomputed.npz"

TARGET_SR = 16000
SEGMENT_SEC = 10
N_MELS = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


In [3]:
df = pd.read_csv(CSV_PATH)

label_map = {"hoax": 0, "valid": 1}
df["y"] = df["label"].map(label_map)

print("Total distribusi label:\n", df["y"].value_counts())

train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["y"],
    random_state=42,   # konsisten dengan training
)

print("Train label dist:\n", train_df["y"].value_counts())
print("Val   label dist:\n", val_df["y"].value_counts())


Total distribusi label:
 y
1    156
0     52
Name: count, dtype: int64
Train label dist:
 y
1    125
0     41
Name: count, dtype: int64
Val   label dist:
 y
1    31
0    11
Name: count, dtype: int64


In [4]:
# Cell 3: dataset CNN

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=TARGET_SR,
    n_fft=400,
    hop_length=160,
    win_length=400,
    n_mels=N_MELS,
)
amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

def load_center_segment(path, segment_sec=SEGMENT_SEC):
    wav, sr = torchaudio.load(path)
    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    num_samples = wav.shape[1]
    seg_len = int(TARGET_SR * segment_sec)

    if num_samples <= seg_len:
        pad_len = seg_len - num_samples
        wav = nn.functional.pad(wav, (0, pad_len))
    else:
        center = num_samples // 2
        start = max(0, center - seg_len // 2)
        end = start + seg_len
        if end > num_samples:
            end = num_samples
            start = end - seg_len
        wav = wav[:, start:end]

    return wav

def wav_to_logmel(wav):
    mel = mel_spectrogram(wav)
    mel_db = amplitude_to_db(mel)
    mean = mel_db.mean()
    std = mel_db.std() + 1e-9
    mel_db = (mel_db - mean) / std
    return mel_db

class AudioHoaxDatasetCNN(Dataset):
    def __init__(self, df, base_dir=AUDIO_BASE_DIR):
        self.df = df.reset_index(drop=True)
        self.base_dir = base_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio_path = os.path.join(self.base_dir, row["audio_path"])
        wav = load_center_segment(audio_path)
        feat = wav_to_logmel(wav)          # [1, F, T]
        label = torch.tensor(row["y"], dtype=torch.long)
        return feat, label

val_ds_cnn = AudioHoaxDatasetCNN(val_df)
val_loader_cnn = DataLoader(
    val_ds_cnn,
    batch_size=8,
    shuffle=False,
    num_workers=0,
)


In [5]:
# Cell 4: model CNN

class AudioCNN(nn.Module):
    def __init__(self, n_mels=N_MELS, num_classes=2):
        super().__init__()
        self.conv_block = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(16, num_classes),
        )

    def forward(self, x):
        h = self.conv_block(x)
        h = self.global_pool(h)
        h = h.view(h.size(0), -1)
        logits = self.fc(h)
        return logits


In [6]:
# Cell 5: fungsi evaluasi

def eval_cnn(model, loader):
    model.eval()
    all_y, all_pred = [], []
    with torch.no_grad():
        for feats, labels in loader:
            feats = feats.to(device)
            labels = labels.to(device)
            logits = model(feats)
            preds = torch.argmax(logits, dim=-1)
            all_y.extend(labels.cpu().numpy().tolist())
            all_pred.extend(preds.cpu().numpy().tolist())

    print(classification_report(all_y, all_pred, digits=4))
    acc = accuracy_score(all_y, all_pred)
    f1_valid = f1_score(all_y, all_pred, pos_label=1)
    f1_hoax  = f1_score(all_y, all_pred, pos_label=0)
    f1_macro = f1_score(all_y, all_pred, average="macro")
    print("Acc:", acc, "F1_hoax:", f1_hoax, "F1_valid:", f1_valid, "F1_macro:", f1_macro)
    return acc, f1_hoax, f1_valid, f1_macro


def eval_head_v4(model, loader):
    model.eval()
    all_y, all_pred = [], []
    with torch.no_grad():
        for emb, labels in loader:
            emb = emb.to(device)
            labels = labels.to(device)
            logits = model(emb)
            preds = torch.argmax(logits, dim=-1)
            all_y.extend(labels.cpu().numpy().tolist())
            all_pred.extend(preds.cpu().numpy().tolist())

    print(classification_report(all_y, all_pred, digits=4))
    acc = accuracy_score(all_y, all_pred)
    f1_valid = f1_score(all_y, all_pred, pos_label=1)
    f1_hoax  = f1_score(all_y, all_pred, pos_label=0)
    f1_macro = f1_score(all_y, all_pred, average="macro")
    print("Acc:", acc, "F1_hoax:", f1_hoax, "F1_valid:", f1_valid, "F1_macro:", f1_macro)
    return acc, f1_hoax, f1_valid, f1_macro


In [9]:
# Cell 6: dataset embedding v4 (VERSI DIBETULKAN)

data_npz = np.load(EMB_PATH, allow_pickle=True)
embeddings_np = data_npz["embeddings"]   # array/list of emb
ids_np       = data_npz["ids"]           # optional, kalau mau dicek
paths_np     = data_npz["paths"]         # optional

print("Total embeddings:", len(embeddings_np))

class EmbeddingDataset(Dataset):
    def __init__(self, df, embeddings_np):
        """
        df      : DataFrame audio_only_dataset.csv (sudah diurutkan sama seperti saat precompute)
        embeddings_np : array/list of np.ndarray, satu embedding per baris df
        """
        self.df = df.reset_index(drop=True)
        self.embs = embeddings_np

    def __len__(self):
        return len(self.embs)

    def __getitem__(self, idx):
        emb = self.embs[idx]                       # np.ndarray, shape [T, 768] atau [1, 768]
        emb_t = torch.tensor(emb, dtype=torch.float32)
        label = int(self.df.iloc[idx]["y"])        # kolom y = 0/1 (hoax/valid)
        return emb_t, torch.tensor(label, dtype=torch.long)

emb_dataset = EmbeddingDataset(df, embeddings_np)
sample_emb, _ = emb_dataset[0]
print("Sample emb shape:", sample_emb.shape)

emb_loader = DataLoader(
    emb_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
)


Total embeddings: 208
Sample emb shape: torch.Size([499, 768])


In [10]:
# Cell 7a: head sederhana (kalau emb sudah [D])

class AudioHeadV4(nn.Module):
    def __init__(self, dim=768, num_labels=2):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels),
        )

    def forward(self, x):    # x: [B, D]
        return self.mlp(x)


In [11]:
# Cell 7b: pakai attentive pooling (kalau emb [T, D])

class AttentivePool(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)
        self.context = nn.Linear(dim, 1, bias=False)

    def forward(self, x):        # x: [B, T, D]
        h = torch.tanh(self.proj(x))
        scores = self.context(h).squeeze(-1)      # [B, T]
        attn = torch.softmax(scores, dim=-1)      # [B, T]
        pooled = torch.bmm(attn.unsqueeze(1), x).squeeze(1)  # [B, D]
        return pooled

class AudioHeadV4(nn.Module):
    def __init__(self, dim=768, num_labels=2):
        super().__init__()
        self.pool = AttentivePool(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels),
        )

    def forward(self, x):        # x: [B, T, D]
        pooled = self.pool(x)
        logits = self.mlp(pooled)
        return logits


In [15]:
class AttentivePool(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.linear = nn.Linear(dim, dim)
        self.context = nn.Linear(dim, 1)

    def forward(self, x):          # x: [B, T, D]
        h = torch.tanh(self.linear(x))
        scores = self.context(h).squeeze(-1)   # [B, T]
        w = torch.softmax(scores, dim=1).unsqueeze(-1)
        return (w * x).sum(dim=1)              # [B, D]


class AudioHeadV4(nn.Module):
    def __init__(self, dim=768, hidden=256, nclass=2):
        super().__init__()
        self.pool = AttentivePool(dim)
        self.fc = nn.Sequential(
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, nclass),
        )

    def forward(self, x):          # x: [B, T, D]
        pooled = self.pool(x)
        logits = self.fc(pooled)
        return logits


In [16]:
# Cell 8: evaluasi & tabel ringkas

results = []

# ===== Model 2: CNN + weight =====
model2 = AudioCNN().to(device)
model2.load_state_dict(torch.load(MODEL2_PATH, map_location=device))
acc2, f1h2, f1v2, f1m2 = eval_cnn(model2, val_loader_cnn)
results.append(("Model 2 (CNN + weight)", acc2, f1h2, f1v2, f1m2))

# ===== Model 4: Wav2Vec2 v4 head =====
# emb_dataset mengembalikan (emb, label), emb: [T, D]
sample_emb, _ = emb_dataset[0]
dim_emb = sample_emb.shape[-1]

head_v4 = AudioHeadV4(dim=dim_emb, hidden=256, nclass=2).to(device)
state_dict = torch.load(MODEL4_HEAD_PATH, map_location=device)
head_v4.load_state_dict(state_dict)

acc4, f1h4, f1v4, f1m4 = eval_head_v4(head_v4, emb_loader)
results.append(("Model 4 (Wav2Vec2 v4)", acc4, f1h4, f1v4, f1m4))

print("\n=== Perbandingan Model Audio di Validation ===")
print("{:<25} {:>8} {:>10} {:>10} {:>10}".format(
    "Model", "Acc", "F1_hoax", "F1_valid", "F1_macro"
))
for name, acc, f1h, f1v, f1m in results:
    print("{:<25} {:>8.4f} {:>10.4f} {:>10.4f} {:>10.4f}".format(
        name, acc, f1h, f1v, f1m
    ))


  model2.load_state_dict(torch.load(MODEL2_PATH, map_location=device))


              precision    recall  f1-score   support

           0     0.4211    0.7273    0.5333        11
           1     0.8696    0.6452    0.7407        31

    accuracy                         0.6667        42
   macro avg     0.6453    0.6862    0.6370        42
weighted avg     0.7521    0.6667    0.6864        42

Acc: 0.6666666666666666 F1_hoax: 0.5333333333333333 F1_valid: 0.7407407407407407 F1_macro: 0.6370370370370371


  state_dict = torch.load(MODEL4_HEAD_PATH, map_location=device)


              precision    recall  f1-score   support

           0     0.5424    0.6154    0.5766        52
           1     0.8658    0.8269    0.8459       156

    accuracy                         0.7740       208
   macro avg     0.7041    0.7212    0.7112       208
weighted avg     0.7849    0.7740    0.7786       208

Acc: 0.7740384615384616 F1_hoax: 0.5765765765765766 F1_valid: 0.8459016393442623 F1_macro: 0.7112391079604194

=== Perbandingan Model Audio di Validation ===
Model                          Acc    F1_hoax   F1_valid   F1_macro
Model 2 (CNN + weight)      0.6667     0.5333     0.7407     0.6370
Model 4 (Wav2Vec2 v4)       0.7740     0.5766     0.8459     0.7112
