In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import torch
import torch.nn as nn
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertForSequenceClassification

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# Project root. Defaults to the original checkout location; set the
# HOAX_PROJECT_ROOT environment variable to run the notebook from another machine.
PROJECT_ROOT = os.environ.get(
    "HOAX_PROJECT_ROOT",
    r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection",
)

# Important paths (all derived from PROJECT_ROOT except the embedding archive)
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "training", "multimodal_splits", "text_audio_dataset.csv")
AUDIO_EMB_PATH = "audio_embeddings_precomputed.npz"  # bare file name: resolved against the working directory
AUDIO_HEAD_PATH = os.path.join(PROJECT_ROOT, "models", "audio_baseline", "best_audio_wav2vec2.pt")
TEXT_MODEL_PATH = os.path.join(PROJECT_ROOT, "models", "text_baseline", "indobert-base-p1")

MAX_LEN = 128      # tokenizer max_length passed to TextDataset
BATCH_SIZE = 16    # batch size shared by every DataLoader


Device: cuda


In [2]:
df = pd.read_csv(DATA_PATH)

# Assumed columns: sample_id, title, label, audio_path_full
print(df.head())

# Integer encoding used everywhere below: 0 = hoax, 1 = valid
label_map = {"hoax": 0, "valid": 1}
df["label_int"] = df["label"].map(label_map)

# Series.map() yields NaN for any label missing from label_map; fail here with a
# clear message instead of letting NaN reach the stratified split or int() casts.
unmapped_labels = df.loc[df["label_int"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels not covered by label_map: {unmapped_labels.tolist()}")

print("Distribusi label:", df["label_int"].value_counts())


  sample_id label     data_source  confidence  sample_weight  \
0  YT_00081  hoax  predicted_hoax    0.905624       0.820156   
1  YT_00867  hoax  predicted_hoax    0.850997       0.724195   
2  YT_00904  hoax  predicted_hoax    0.772077       0.596103   
3  YT_01161  hoax  predicted_hoax    0.755714       0.571103   
4  YT_00039  hoax  predicted_hoax    0.711433       0.506137   

                                               title  \
0  Publik Geram, Hukum Tumpul di Kasus Silfester?...   
1  Wakil Presiden Gibran Rakabuming Raka Digugat ...   
2  Peran dan Ideologi Partai Politik di Indonesia...   
3  [BREAKING NEWS] Sederet Nama yang Dilantik Men...   
4  Apakah Benar Jokowi Jadi Sekjen PBB 2026? | On...   

                                        text_content  \
0  Publik Geram, Hukum Tumpul di Kasus Silfester?...   
1  Wakil Presiden Gibran Rakabuming Raka Digugat ...   
2  Peran dan Ideologi Partai Politik di Indonesia...   
3  [BREAKING NEWS] Sederet Nama yang Dilantik Men...  

In [3]:
# Stratified 70 / 30 split; the 30% hold-out is halved below into validation and test.
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["label_int"], random_state=42)

# Same seed and stratification for the second cut, so the class ratio is kept in every part.
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label_int"], random_state=42)

# Report size and class balance of each part.
for name, part in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    label_counts = part["label_int"].value_counts()
    print(name, "size:", len(part), "label distrib:\n", label_counts)


Train size: 145 label distrib:
 label_int
1    109
0     36
Name: count, dtype: int64
Val size: 31 label distrib:
 label_int
1    23
0     8
Name: count, dtype: int64
Test size: 32 label distrib:
 label_int
1    24
0     8
Name: count, dtype: int64


In [4]:
# Load the tokenizer and the sequence-classification model from the local checkpoint directory.
# NOTE(review): the directory is named "indobert-base-p1"; if it contains the base pretrained
# weights rather than a fine-tuned classifier, the classification head would be freshly
# initialised and its probabilities meaningless — confirm what is stored there.
# NOTE(review): later cells read class index 0 as "hoax"; confirm this matches the label
# order used when the text model was trained.
tokenizer = BertTokenizer.from_pretrained(TEXT_MODEL_PATH)
text_model = BertForSequenceClassification.from_pretrained(TEXT_MODEL_PATH).to(DEVICE)
text_model.eval()  # inference only: switch off training-time behaviour such as dropout

class TextDataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a DataFrame row by row.

    Args:
        df: DataFrame with at least the columns ``title`` and ``label_int``.
            The index is reset so items are addressed positionally.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_len: length every title is padded / truncated to.

    Each item is a dict with ``input_ids``, ``attention_mask`` (leading batch
    dimension removed) and ``label`` (scalar ``torch.long`` tensor).
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = row["title"]
        # A missing title is NaN in pandas; str(NaN) would feed the literal
        # word "nan" to the model, so use an empty string instead.
        text = "" if pd.isna(title) else str(title)
        label = int(row["label_int"])

        enc = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            # return_tensors="pt" adds a batch dimension of 1; drop it so the
            # collate function can stack items.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

def text_collate_fn(batch):
    """Stack per-sample text tensors into batch tensors.

    Args:
        batch: list of dicts, each holding ``input_ids``, ``attention_mask``
            and ``label`` tensors of matching shapes across samples.

    Returns:
        Dict with the same three keys, each value stacked along a new
        leading batch dimension.
    """
    fields = ("input_ids", "attention_mask", "label")
    return {field: torch.stack([sample[field] for sample in batch]) for field in fields}


In [5]:
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code; only load archives produced by this project.
data_npz = np.load(AUDIO_EMB_PATH, allow_pickle=True)
embeddings_np = data_npz["embeddings"]
ids_np = data_npz["ids"]  # expected to hold sample_id values written by the v4 audio notebook — TODO confirm
paths_np = data_npz["paths"]

# Every id must have exactly one embedding row, otherwise the index mapping below is meaningless.
if len(embeddings_np) != len(ids_np):
    raise ValueError(
        f"Embedding archive is inconsistent: {len(embeddings_np)} embeddings vs {len(ids_np)} ids"
    )

print("Total embedding:", len(embeddings_np))

# Mapping sample_id -> row index in embeddings_np. Keys are normalised to plain str
# because AudioEmbDataset looks ids up with str(row["sample_id"]); bytes ids are decoded
# so that they do not turn into "b'...'" strings that could never match.
id2idx = {
    (sid.decode("utf-8") if isinstance(sid, bytes) else str(sid)): i
    for i, sid in enumerate(ids_np)
}

class AudioEmbDataset(Dataset):
    """Dataset serving precomputed audio embeddings for the rows of a DataFrame.

    Args:
        df: DataFrame with at least ``sample_id`` and ``label_int`` columns.
            The index is reset so items are addressed positionally.
        id2idx: mapping from ``str(sample_id)`` to a row index of ``embeddings_np``.
        embeddings_np: indexable collection of embeddings, one entry per id.

    Attributes:
        emb_dim: feature size used for the all-zero fallback embedding. Taken
            from the last axis of the first stored embedding; 768 when there
            is no embedding to inspect.
        missing_ids: set of sample ids that were requested but had no
            embedding. Filled lazily by ``__getitem__`` so the zero-fill is
            visible to the caller. (With DataLoader worker processes each
            worker would hold its own copy.)

    Each item is a dict with ``emb`` (float32 tensor, at least 2-D) and
    ``label`` (scalar ``torch.long`` tensor).
    """

    def __init__(self, df, id2idx, embeddings_np):
        self.df = df.reset_index(drop=True)
        self.id2idx = id2idx
        self.embeddings_np = embeddings_np

        first_shape = np.asarray(embeddings_np[0]).shape if len(embeddings_np) > 0 else ()
        self.emb_dim = int(first_shape[-1]) if first_shape else 768
        self.missing_ids = set()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sid = str(row["sample_id"])  # must match the key format of id2idx
        label = int(row["label_int"])

        emb_idx = self.id2idx.get(sid)
        if emb_idx is None:
            # No embedding for this sample: fall back to a single all-zero frame
            # and remember the id so the caller can inspect how often this happens.
            self.missing_ids.add(sid)
            e = np.zeros((1, self.emb_dim), dtype=np.float32)
        else:
            e = self.embeddings_np[emb_idx]

        e_t = torch.tensor(e, dtype=torch.float32)
        if e_t.dim() == 1:
            # A pooled [D] vector becomes a one-frame sequence [1, D] so that
            # the collate function can index shape[1].
            e_t = e_t.unsqueeze(0)
        return {"emb": e_t, "label": torch.tensor(label, dtype=torch.long)}

def audio_collate_fn(batch):
    """Zero-pad variable-length embedding sequences into one batch tensor.

    Args:
        batch: list of dicts with ``emb`` (2-D tensor, frames x features) and
            ``label`` (scalar tensor).

    Returns:
        Dict with ``emb`` of shape [len(batch), longest sequence, features]
        (float32, shorter sequences right-padded with zeros) and ``label``
        stacked along the batch dimension. No padding mask is returned.
    """
    sequences = [sample["emb"] for sample in batch]
    labels = torch.stack([sample["label"] for sample in batch])

    # Target shape: longest sequence in this batch, feature size of the first sample.
    frame_counts = [seq.shape[0] for seq in sequences]
    feature_dim = sequences[0].shape[1]

    padded = torch.zeros(len(sequences), max(frame_counts), feature_dim, dtype=torch.float32)
    for row, (seq, n_frames) in enumerate(zip(sequences, frame_counts)):
        padded[row, :n_frames, :] = seq

    return {"emb": padded, "label": labels}


Total embedding: 208


In [6]:
# Text datasets and loaders, one per split.
# shuffle=False keeps loader order identical to the DataFrame row order, which the
# later cells rely on when they pair probabilities with label arrays by position.
train_text_ds, val_text_ds, test_text_ds = (
    TextDataset(split_df, tokenizer, MAX_LEN) for split_df in (train_df, val_df, test_df)
)
train_text_loader, val_text_loader, test_text_loader = (
    DataLoader(text_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=text_collate_fn)
    for text_ds in (train_text_ds, val_text_ds, test_text_ds)
)

# Audio datasets and loaders, built from the same splits in the same order.
train_audio_ds, val_audio_ds, test_audio_ds = (
    AudioEmbDataset(split_df, id2idx, embeddings_np) for split_df in (train_df, val_df, test_df)
)
train_audio_loader, val_audio_loader, test_audio_loader = (
    DataLoader(audio_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=audio_collate_fn)
    for audio_ds in (train_audio_ds, val_audio_ds, test_audio_ds)
)


In [7]:
# The head definition is exactly the same as in audio_onlyv4

class AttentivePool(nn.Module):
    """Attention-weighted pooling over the time axis of a [B, T, D] tensor.

    A score per time step is computed as ``context(tanh(linear(x)))``; the
    scores are soft-maxed over T and used to average the input frames. Every
    time step takes part in the softmax — there is no padding mask.
    """

    def __init__(self, dim):
        super().__init__()
        self.linear = nn.Linear(dim, dim)   # frame-wise projection
        self.context = nn.Linear(dim, 1)    # maps each projected frame to a scalar score

    def forward(self, x):
        """Pool ``x`` of shape [B, T, D] into a tensor of shape [B, D]."""
        projected = self.linear(x).tanh()
        frame_scores = self.context(projected).squeeze(-1)        # [B, T]
        frame_weights = frame_scores.softmax(dim=1).unsqueeze(-1)  # [B, T, 1]
        return torch.sum(frame_weights * x, dim=1)

class AudioHeadV4(nn.Module):
    """Classification head: attentive pooling followed by a two-layer MLP.

    Args:
        dim: feature size of the incoming embeddings.
        hidden: width of the hidden layer.
        nclass: number of output logits.
    """

    def __init__(self, dim=768, hidden=256, nclass=2):
        super().__init__()
        self.pool = AttentivePool(dim)
        classifier_layers = [
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, nclass),
        ]
        self.fc = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Return logits of shape [B, nclass] for ``x`` of shape [B, T, D]."""
        return self.fc(self.pool(x))

# Build the head and restore its trained weights.
audio_head = AudioHeadV4(dim=768, hidden=256, nclass=2).to(DEVICE)
# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state_dict needs; it avoids arbitrary code execution from the checkpoint file and
# removes the FutureWarning torch.load emits when the argument is left unset.
audio_head.load_state_dict(torch.load(AUDIO_HEAD_PATH, map_location=DEVICE, weights_only=True))
audio_head.eval()


  audio_head.load_state_dict(torch.load(AUDIO_HEAD_PATH, map_location=DEVICE))


AudioHeadV4(
  (pool): AttentivePool(
    (linear): Linear(in_features=768, out_features=768, bias=True)
    (context): Linear(in_features=768, out_features=1, bias=True)
  )
  (fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=2, bias=True)
  )
)

In [8]:
def get_text_probs(loader):
    """Run the text model over ``loader`` and collect the class-0 probability.

    Args:
        loader: iterable of batches with ``input_ids`` and ``attention_mask``.

    Returns:
        1-D NumPy array with one softmax probability of class index 0
        (treated as "hoax" in this notebook) per sample, in loader order.
    """
    text_model.eval()
    hoax_probs = []
    with torch.no_grad():
        for batch in loader:
            logits = text_model(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
            ).logits
            class_probs = softmax(logits, dim=-1)
            # column 0 = probability of the hoax class (label 0)
            hoax_probs += class_probs[:, 0].cpu().numpy().tolist()
    return np.array(hoax_probs)


def get_audio_probs(loader):
    """Run the audio head over ``loader`` and collect the class-0 probability.

    Args:
        loader: iterable of batches with an ``emb`` tensor of shape [B, T, D].

    Returns:
        1-D NumPy array with one softmax probability of class index 0
        (treated as "hoax" in this notebook) per sample, in loader order.
    """
    audio_head.eval()
    hoax_probs = []
    with torch.no_grad():
        for batch in loader:
            logits = audio_head(batch["emb"].to(DEVICE))
            class_probs = softmax(logits, dim=-1)
            # column 0 = probability of the hoax class
            hoax_probs += class_probs[:, 0].cpu().numpy().tolist()
    return np.array(hoax_probs)


In [9]:
# Hoax probabilities from the text model, one array per split.
p_text_train, p_text_val, p_text_test = (
    get_text_probs(text_loader)
    for text_loader in (train_text_loader, val_text_loader, test_text_loader)
)

# Hoax probabilities from the audio head, one array per split.
p_audio_train, p_audio_val, p_audio_test = (
    get_audio_probs(audio_loader)
    for audio_loader in (train_audio_loader, val_audio_loader, test_audio_loader)
)

# Ground-truth labels in DataFrame row order; the loaders are unshuffled, so position i
# of each probability array refers to the same row as position i here.
y_train, y_val, y_test = (
    split_df["label_int"].to_numpy() for split_df in (train_df, val_df, test_df)
)


In [10]:
def evaluate_probs(p_hoax, y_true, thr=0.5):
    """Threshold hoax probabilities and score them against the true labels.

    Args:
        p_hoax: array-like of hoax probabilities, one per sample.
        y_true: array-like of integer labels (0 = hoax, 1 = valid).
        thr: a sample is predicted as hoax when its probability is >= ``thr``.

    Returns:
        Tuple ``(accuracy, f1_hoax, f1_valid, f1_macro)``.
    """
    # label_int uses 0 = hoax and 1 = valid, so a probability at or above the
    # threshold maps to class 0 and everything else to class 1.
    y_pred = np.where(np.asarray(p_hoax) >= thr, 0, 1)

    acc = accuracy_score(y_true, y_pred)
    f1_hoax = f1_score(y_true, y_pred, pos_label=0)
    f1_valid = f1_score(y_true, y_pred, pos_label=1)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    return acc, f1_hoax, f1_valid, f1_macro

def fusion_weighted(p_text, p_audio, alpha):
    """Blend two probability arrays as ``alpha * p_text + (1 - alpha) * p_audio``.

    Args:
        p_text: hoax probabilities from the text model.
        p_audio: hoax probabilities from the audio model.
        alpha: weight given to the text model; the audio model gets ``1 - alpha``.
    """
    text_share = alpha * p_text
    audio_share = (1 - alpha) * p_audio
    return text_share + audio_share

# Grid-search the text weight on the validation split: 0.0, 0.1, ..., 1.0.
alphas = np.linspace(0, 1, 11)
records = []
for a in alphas:
    p_fuse_val = fusion_weighted(p_text_val, p_audio_val, a)
    acc, f1h, f1v, f1m = evaluate_probs(p_fuse_val, y_val)
    records.append((a, acc, f1h, f1v, f1m))

df_alpha = pd.DataFrame(records, columns=["alpha_text", "acc", "f1_hoax", "f1_valid", "f1_macro"])
print(df_alpha)
# Pick the row with the highest macro-F1. idxmax returns the first row holding the
# maximum, so a tie always resolves to the smallest alpha; sorting with the default
# (unstable) sort algorithm gives no such guarantee.
best_row = df_alpha.loc[df_alpha["f1_macro"].idxmax()]
best_alpha = float(best_row["alpha_text"])
print("Best alpha_text:", best_alpha)


    alpha_text       acc   f1_hoax  f1_valid  f1_macro
0          0.0  0.806452  0.666667  0.863636  0.765152
1          0.1  0.903226  0.800000  0.936170  0.868085
2          0.2  0.838710  0.615385  0.897959  0.756672
3          0.3  0.806452  0.500000  0.880000  0.690000
4          0.4  0.838710  0.615385  0.897959  0.756672
5          0.5  0.806452  0.571429  0.875000  0.723214
6          0.6  0.774194  0.533333  0.851064  0.692199
7          0.7  0.774194  0.533333  0.851064  0.692199
8          0.8  0.774194  0.533333  0.851064  0.692199
9          0.9  0.774194  0.533333  0.851064  0.692199
10         1.0  0.774194  0.533333  0.851064  0.692199
Best alpha_text: 0.1


In [11]:
# Late fusion on the test split with the weight selected on validation.
p_fuse_test = fusion_weighted(p_text_test, p_audio_test, best_alpha)

# Text-only and audio-only scores serve as baselines for the fused result.
acc_t, f1h_t, f1v_t, f1m_t = evaluate_probs(p_text_test, y_test)
acc_a, f1h_a, f1v_a, f1m_a = evaluate_probs(p_audio_test, y_test)
acc_f, f1h_f, f1v_f, f1m_f = evaluate_probs(p_fuse_test, y_test)

print("=== Test Results ===")
for template, scores in (
    ("Text only  : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}", (acc_t, f1h_t, f1v_t, f1m_t)),
    ("Audio only : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}", (acc_a, f1h_a, f1v_a, f1m_a)),
    ("Fusion     : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}", (acc_f, f1h_f, f1v_f, f1m_f)),
):
    print(template.format(*scores))

# Detailed breakdown for the fused predictions (0 = hoax when probability >= 0.5).
y_pred_f = np.where(p_fuse_test >= 0.5, 0, 1)
print("\nClassification report (fusion):")
print(classification_report(y_test, y_pred_f, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_f))


=== Test Results ===
Text only  : acc=0.7188 f1h=0.4000 f1v=0.8163 f1m=0.6082
Audio only : acc=0.8125 f1h=0.6250 f1v=0.8750 f1m=0.7500
Fusion     : acc=0.8125 f1h=0.5714 f1v=0.8800 f1m=0.7257

Classification report (fusion):
              precision    recall  f1-score   support

           0     0.6667    0.5000    0.5714         8
           1     0.8462    0.9167    0.8800        24

    accuracy                         0.8125        32
   macro avg     0.7564    0.7083    0.7257        32
weighted avg     0.8013    0.8125    0.8029        32

Confusion matrix:
 [[ 4  4]
 [ 2 22]]


In [12]:
import joblib
import numpy as np

class TextAudioLateFusion:
    """Weighted late fusion of text and audio hoax probabilities.

    The fused hoax probability is
    ``alpha_text * p_text_hoax + (1 - alpha_text) * p_audio_hoax``.
    Class indices follow the notebook convention: 0 = hoax, 1 = valid.

    Args:
        alpha_text: weight of the text model, in [0, 1].

    Raises:
        ValueError: if ``alpha_text`` lies outside [0, 1], since the blend
            would no longer be a valid probability.
    """

    def __init__(self, alpha_text=0.1):
        alpha_text = float(alpha_text)
        if not 0.0 <= alpha_text <= 1.0:
            raise ValueError(f"alpha_text must be within [0, 1], got {alpha_text}")
        self.alpha_text = alpha_text
        self.alpha_audio = 1.0 - alpha_text

    @staticmethod
    def _hoax_column(probs):
        """Return the hoax probabilities from an (N, 2) or (N,) array-like."""
        probs = np.asarray(probs, dtype=float)  # also accepts lists / tuples
        # For a 2-column probability matrix, column 0 is the hoax class.
        return probs[:, 0] if probs.ndim == 2 else probs

    def predict_proba(self, p_text, p_audio):
        """
        p_text : array-like, shape (N, 2) or (N,) hoax probability
        p_audio: array-like, shape (N, 2) or (N,) hoax probability
        return : fused probabilities, shape (N, 2) as [p_hoax, p_valid]
        """
        p_hoax = (
            self.alpha_text * self._hoax_column(p_text)
            + self.alpha_audio * self._hoax_column(p_audio)
        )
        p_valid = 1.0 - p_hoax
        return np.vstack([p_hoax, p_valid]).T

    def predict(self, p_text, p_audio):
        """Return integer labels: 0 (hoax) when fused p_hoax >= 0.5, else 1 (valid)."""
        probs = self.predict_proba(p_text, p_audio)
        # 0 = hoax, 1 = valid
        return (probs[:, 0] < 0.5).astype(int)

# Create an instance with the best alpha found on the validation split
fusion_model = TextAudioLateFusion(alpha_text=best_alpha)

# Save to file. The file name is derived from best_alpha so it can never disagree with
# the weight stored inside (0.1 -> "alpha0_1", the name used so far).
# NOTE(review): pickling stores a reference to the class, not its code; loading this file
# requires TextAudioLateFusion to be importable under the same module path (here the
# notebook's __main__). Consider moving the class into a .py module.
FUSION_DIR = r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\fusion_final"
fusion_filename = "text_audio_late_fusion_alpha{}.joblib".format(f"{best_alpha:g}".replace(".", "_"))
os.makedirs(FUSION_DIR, exist_ok=True)  # do not fail just because the folder is missing
joblib.dump(fusion_model, os.path.join(FUSION_DIR, fusion_filename))
print("Saved late fusion model to", fusion_filename)


Saved late fusion model to text_audio_late_fusion_alpha0_1.joblib
