In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import torch
import torch.nn as nn
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from torchvision import transforms
from PIL import Image

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# Paths
# Every project artifact lives under one root. The default is the original
# author's checkout; set the HOAX_PROJECT_ROOT environment variable to run the
# notebook from another location without editing this cell.
PROJECT_ROOT = os.environ.get(
    "HOAX_PROJECT_ROOT",
    r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection",
)

# CSV with one row per sample (text, image path, audio id, label).
DATA_PATH = os.path.join(
    PROJECT_ROOT, "data", "training", "multimodal_splits", "text_image_audio_dataset.csv"
)

# Per-modality checkpoints used by the late-fusion experiment.
TEXT_MODEL_PATH = os.path.join(PROJECT_ROOT, "models", "text_baseline", "indobert-base-p1")
IMAGE_MODEL_PATH = os.path.join(
    PROJECT_ROOT, "models", "image_baseline", "best_mobilenetv3_tf_style.pth"
)
AUDIO_HEAD_PATH = os.path.join(
    PROJECT_ROOT, "models", "audio_baseline", "best_audio_wav2vec2.pt"
)
# Pre-computed audio embeddings, resolved relative to the working directory.
AUDIO_EMB_PATH = "audio_embeddings_precomputed.npz"

MAX_LEN = 128      # maximum token length passed to the text tokenizer
BATCH_SIZE = 16    # batch size for every inference DataLoader


Device: cuda


In [72]:
# Load the multimodal sample table and derive the integer class id.
df = pd.read_csv(DATA_PATH)

# Class convention used throughout the notebook: 0 = hoax, 1 = valid.
label_map = {"hoax": 0, "valid": 1}
df["label_int"] = df["label"].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map. Fail here
# with a clear message instead of later inside a Dataset (int(NaN)) or the
# stratified split.
_unmapped_labels = df.loc[df["label_int"].isna(), "label"]
if not _unmapped_labels.empty:
    raise ValueError(
        "Unknown label values in {!r}: {}".format(
            DATA_PATH, sorted(_unmapped_labels.astype(str).unique())
        )
    )

print(df.head())
print("Total data:", len(df))
print("Label distrib total:\n", df["label_int"].value_counts())


  sample_id label     data_source  confidence  sample_weight  \
0  YT_00081  hoax  predicted_hoax    0.905624       0.820156   
1  YT_00867  hoax  predicted_hoax    0.850997       0.724195   
2  YT_00904  hoax  predicted_hoax    0.772077       0.596103   
3  YT_01161  hoax  predicted_hoax    0.755714       0.571103   
4  YT_00039  hoax  predicted_hoax    0.711433       0.506137   

                                               title  \
0  Publik Geram, Hukum Tumpul di Kasus Silfester?...   
1  Wakil Presiden Gibran Rakabuming Raka Digugat ...   
2  Peran dan Ideologi Partai Politik di Indonesia...   
3  [BREAKING NEWS] Sederet Nama yang Dilantik Men...   
4  Apakah Benar Jokowi Jadi Sekjen PBB 2026? | On...   

                                        text_content  \
0  Publik Geram, Hukum Tumpul di Kasus Silfester?...   
1  Wakil Presiden Gibran Rakabuming Raka Digugat ...   
2  Peran dan Ideologi Partai Politik di Indonesia...   
3  [BREAKING NEWS] Sederet Nama yang Dilantik Men...  

In [None]:
# Stratified 70 / 15 / 15 split: first hold out 30% of the rows, then cut that
# hold-out in half to obtain the validation and test parts. Both cuts stratify
# on the integer label and use a fixed seed so the partition is reproducible.
train_df, temp_df = train_test_split(
    df, test_size=0.3, stratify=df["label_int"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label_int"], random_state=42
)

# Report the size and class balance of every part.
splits = {"Train": train_df, "Val": val_df, "Test": test_df}
for name, part in splits.items():
    print(f"{name} size: {len(part)}")
    print(part["label_int"].value_counts(), "\n")


Train size: 145
label_int
1    109
0     36
Name: count, dtype: int64 

Val size: 31
label_int
1    23
0     8
Name: count, dtype: int64 

Test size: 32
label_int
1    24
0     8
Name: count, dtype: int64 



In [74]:
# Load the tokenizer and the sequence-classification model stored under
# TEXT_MODEL_PATH, move the model to DEVICE and switch it to inference mode.
# NOTE(review): the directory is named "indobert-base-p1", which looks like a
# base checkpoint. If it holds no fine-tuned classification head, the head is
# randomly initialised on load -- confirm this is the fine-tuned model and that
# its class index 0 really means "hoax".
tokenizer = BertTokenizer.from_pretrained(TEXT_MODEL_PATH)
text_model = BertForSequenceClassification.from_pretrained(TEXT_MODEL_PATH)
text_model = text_model.to(DEVICE)
text_model.eval()

class TextDataset(Dataset):
    """Tokenized view of one text column of the sample table.

    Args:
        df: frame holding at least ``text_col`` and ``label_int``.
        tokenizer: callable with the Hugging Face tokenizer call signature
            (``max_length``, ``padding``, ``truncation``, ``return_tensors``).
        max_len: every sample is padded or truncated to this many tokens.
        text_col: column to read the text from. Defaults to ``"title"``; pass
            ``"text_content"`` to classify the longer text instead.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """

    def __init__(self, df, tokenizer, max_len=128, text_col="title"):
        if text_col not in df.columns:
            raise KeyError(f"text column {text_col!r} not found in dataframe")
        # Reset the index so positional lookups line up with __len__.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col = text_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``."""
        row = self.df.iloc[idx]
        raw_text = row[self.text_col]
        # A missing cell is NaN; str(NaN) would feed the literal word "nan" to
        # the model, so use an empty string instead.
        text = "" if pd.isna(raw_text) else str(raw_text)
        label = int(row["label_int"])

        enc = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; drop it.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

def text_collate_fn(batch):
    """Stack per-sample text tensors into batch tensors.

    Args:
        batch: list of dicts as produced by ``TextDataset.__getitem__``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``label``, each stacked
        along a new leading batch axis.
    """
    fields = ("input_ids", "attention_mask", "label")
    return {field: torch.stack([sample[field] for sample in batch]) for field in fields}


In [75]:
# NOTE(security): allow_pickle=True lets numpy unpickle object arrays, which can
# execute arbitrary code. Only load an archive you produced yourself.
# The context manager closes the underlying file once both arrays are read.
with np.load(AUDIO_EMB_PATH, allow_pickle=True) as data_npz:
    embeddings_np = data_npz["embeddings"]
    ids_np = data_npz["ids"]

# Each id must pair with exactly one embedding, otherwise the lookup built
# below would point at the wrong rows.
if len(embeddings_np) != len(ids_np):
    raise ValueError(
        "audio archive is inconsistent: {} embeddings vs {} ids".format(
            len(embeddings_np), len(ids_np)
        )
    )

print("Total audio embeddings:", len(embeddings_np))

# Map sample_id -> row index into embeddings_np.
id2idx = {str(sid): i for i, sid in enumerate(ids_np)}

class AudioEmbDataset(Dataset):
    """Serve pre-computed audio embeddings for the rows of a sample table.

    Args:
        df: frame holding at least ``sample_id`` and ``label_int``.
        id2idx: mapping from ``str(sample_id)`` to a row index of
            ``embeddings_np``.
        embeddings_np: indexable collection of embeddings, one per id. Each
            entry is a ``[T, D]`` frame sequence or a single ``[D]`` vector.

    Attributes:
        emb_dim: feature width used for the all-zero fallback, taken from the
            first stored embedding (768 if the collection is empty).
        n_missing: number of rows whose ``sample_id`` has no embedding and will
            therefore get the zero fallback.
    """

    def __init__(self, df, id2idx, embeddings_np):
        self.df = df.reset_index(drop=True)
        self.id2idx = id2idx
        self.embeddings_np = embeddings_np
        self.emb_dim = self._infer_dim(embeddings_np)
        # Expose how many samples fall back to zeros so that this best-effort
        # substitution is visible to the caller.
        self.n_missing = sum(
            str(sid) not in id2idx for sid in self.df["sample_id"]
        )

    @staticmethod
    def _infer_dim(embeddings_np, default=768):
        """Return the feature width of the stored embeddings, or ``default``."""
        if len(embeddings_np) == 0:
            return default
        first = np.asarray(embeddings_np[0])
        return int(first.shape[-1]) if first.ndim >= 1 else default

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"emb": float32 [T, D], "label": long scalar}`` for row ``idx``."""
        row = self.df.iloc[idx]
        sid = str(row["sample_id"])
        label = int(row["label_int"])

        if sid in self.id2idx:
            e = np.asarray(self.embeddings_np[self.id2idx[sid]], dtype=np.float32)
            if e.ndim == 1:
                # An already-pooled vector becomes a one-frame sequence so the
                # collate function can always index shape[1].
                e = e[None, :]
        else:
            # No audio for this sample: use a single all-zero frame whose width
            # matches the real embeddings.
            e = np.zeros((1, self.emb_dim), dtype=np.float32)

        e_t = torch.tensor(e, dtype=torch.float32)  # [T, D] or [1, D]
        return {"emb": e_t, "label": torch.tensor(label, dtype=torch.long)}

def audio_collate_fn(batch):
    """Zero-pad variable-length embedding sequences into one batch tensor.

    Args:
        batch: list of dicts with a 2-D ``emb`` tensor ``[T_i, D]`` and a
            scalar ``label`` tensor.

    Returns:
        Dict with ``emb`` (float32, ``[B, max(T_i), D]``, right-padded with
        zeros) and ``label`` (``[B]``).
    """
    sequences = [sample["emb"] for sample in batch]
    targets = torch.stack([sample["label"] for sample in batch])

    longest = max(seq.shape[0] for seq in sequences)
    feat_dim = sequences[0].shape[1]
    out = torch.zeros((len(sequences), longest, feat_dim), dtype=torch.float32)
    # Each `slot` is a view into `out`, so writing to it fills the batch tensor.
    for slot, seq in zip(out, sequences):
        slot[: seq.shape[0]] = seq

    return {"emb": out, "label": targets}


Total audio embeddings: 208


In [76]:
# Image preprocessing, intended to match what the mobilenetv3_tf_style model
# saw during training: resize to 224x224, convert to a tensor, then normalise
# each channel.
# NOTE(review): the mean/std below are the common ImageNet statistics; change
# them if the image model was trained with different values.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
img_transform = transforms.Compose(_preprocess_steps)

class ImageDataset(Dataset):
    """Load the image referenced by each row of the sample table.

    Args:
        df: frame holding at least ``image_path`` and ``label_int``.
        transform: callable applied to the PIL image; identity when ``None``.
        img_size: size of the grey placeholder used when an image cannot be
            loaded.
        base_dir: root folder that the relative ``image_path`` values are
            resolved against. Defaults to the original project checkout.

    Attributes:
        missing_paths: set of paths (or raw cell values) that could not be
            loaded and were replaced by the placeholder. Inspect it after
            iterating to see how much of the data is synthetic.
    """

    def __init__(
        self,
        df,
        transform=None,
        img_size=(224, 224),
        base_dir=r"D:\INDONERIS-DATAMINING\multimodal-hoax-detection",
    ):
        self.df = df.reset_index(drop=True)
        self.transform = transform or (lambda x: x)
        self.img_size = img_size
        self.base_dir = base_dir
        self.missing_paths = set()

    def __len__(self):
        return len(self.df)

    def _resolve_path(self, rel_path):
        """Join ``rel_path`` onto ``base_dir``; return ``None`` for a non-string cell."""
        if not isinstance(rel_path, str):
            # A missing CSV cell is read as float NaN and has no .lstrip().
            return None
        # lstrip removes every leading '.', '/' and '\\' character, so "./a/b"
        # and "\\a\\b" both become "a/b" style paths relative to base_dir.
        return os.path.join(self.base_dir, rel_path.lstrip("./\\"))

    def __getitem__(self, idx):
        """Return ``{"image": transformed image, "label": long scalar}`` for row ``idx``."""
        row = self.df.iloc[idx]
        rel_path = row["image_path"]
        img_path = self._resolve_path(rel_path)
        label = int(row["label_int"])

        img = None
        if img_path is not None:
            try:
                # The context manager closes the file handle; convert() returns
                # an independent in-memory copy that remains valid afterwards.
                with Image.open(img_path) as handle:
                    img = handle.convert("RGB")
            except (FileNotFoundError, OSError):
                img = None

        if img is None:
            # Best-effort fallback: keep the sample but feed a neutral grey
            # image, and remember that this happened.
            self.missing_paths.add(img_path if img_path is not None else str(rel_path))
            img = Image.new("RGB", self.img_size, color=(128, 128, 128))

        img_t = self.transform(img)
        return {"image": img_t, "label": torch.tensor(label, dtype=torch.long)}


def image_collate_fn(batch):
    """Stack per-sample image and label tensors along a new batch axis.

    Args:
        batch: list of dicts as produced by ``ImageDataset.__getitem__``.

    Returns:
        Dict with ``image`` and ``label`` batch tensors.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("image", "label")
    }


In [77]:
def _eval_loader(dataset, collate_fn):
    """Build a non-shuffling DataLoader so outputs stay aligned with the frame rows."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


# The same three partitions feed every modality, always in train / val / test order.
_parts = (train_df, val_df, test_df)

# Text datasets and loaders
train_text_ds, val_text_ds, test_text_ds = (
    TextDataset(part, tokenizer, MAX_LEN) for part in _parts
)
train_text_loader = _eval_loader(train_text_ds, text_collate_fn)
val_text_loader = _eval_loader(val_text_ds, text_collate_fn)
test_text_loader = _eval_loader(test_text_ds, text_collate_fn)

# Audio datasets and loaders (pre-computed embeddings looked up by sample_id)
train_audio_ds, val_audio_ds, test_audio_ds = (
    AudioEmbDataset(part, id2idx, embeddings_np) for part in _parts
)
train_audio_loader = _eval_loader(train_audio_ds, audio_collate_fn)
val_audio_loader = _eval_loader(val_audio_ds, audio_collate_fn)
test_audio_loader = _eval_loader(test_audio_ds, audio_collate_fn)

# Image datasets and loaders
train_img_ds, val_img_ds, test_img_ds = (
    ImageDataset(part, transform=img_transform) for part in _parts
)
train_img_loader = _eval_loader(train_img_ds, image_collate_fn)
val_img_loader = _eval_loader(val_img_ds, image_collate_fn)
test_img_loader = _eval_loader(test_img_ds, image_collate_fn)


In [78]:
# Audio head (the definition must match the one used in audio_onlyv4,
# otherwise the saved state dict will not load).

class AttentivePool(nn.Module):
    """Attention-weighted pooling over the time axis.

    Each frame gets a scalar score from a tanh projection followed by a linear
    layer; the scores are softmax-normalised over time and used to average the
    input frames.
    """

    def __init__(self, dim):
        super().__init__()
        # Attribute names are part of the checkpoint's state-dict keys.
        self.linear = nn.Linear(dim, dim)
        self.context = nn.Linear(dim, 1)

    def forward(self, x):
        """Pool ``x`` of shape ``[B, T, D]`` into ``[B, D]``."""
        hidden = self.linear(x).tanh()
        frame_scores = self.context(hidden).squeeze(-1)      # [B, T]
        frame_weights = frame_scores.softmax(dim=1)          # normalised over T
        weighted = frame_weights.unsqueeze(-1) * x           # [B, T, D]
        return weighted.sum(dim=1)                           # [B, D]


class AudioHeadV4(nn.Module):
    """Classifier head on top of frame-level audio embeddings.

    Pools the ``[B, T, D]`` input with ``AttentivePool`` and feeds the result
    through a two-layer MLP (Linear -> ReLU -> Dropout(0.3) -> Linear).

    Args:
        dim: embedding width ``D``.
        hidden: width of the hidden MLP layer.
        nclass: number of output classes.
    """

    def __init__(self, dim=768, hidden=256, nclass=2):
        super().__init__()
        self.pool = AttentivePool(dim)
        # Layer order fixes the state-dict indices (fc.0 and fc.3 carry weights).
        mlp_layers = [
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, nclass),
        ]
        self.fc = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return class logits ``[B, nclass]`` for ``x`` of shape ``[B, T, D]``."""
        return self.fc(self.pool(x))


# Rebuild the audio head and restore its trained parameters.
audio_head = AudioHeadV4(dim=768, hidden=256, nclass=2).to(DEVICE)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs. It avoids executing arbitrary pickled code and
# silences the torch.load FutureWarning.
audio_state = torch.load(AUDIO_HEAD_PATH, map_location=DEVICE, weights_only=True)
audio_head.load_state_dict(audio_state)
audio_head.eval()



from torchvision import models

def create_mobilenetv3_tf_style(num_classes=2):
    """Build an untrained MobileNetV3-Small with a ``num_classes``-way output layer.

    The network is created without pretrained weights and its last classifier
    layer is swapped for a fresh ``nn.Linear`` of the requested width, so a
    saved state dict with that head can be loaded into it.
    """
    net = models.mobilenet_v3_small(weights=None)
    old_head = net.classifier[-1]
    net.classifier[-1] = nn.Linear(old_head.in_features, num_classes)
    return net

# Rebuild the image classifier and restore its trained parameters.
image_model = create_mobilenetv3_tf_style(num_classes=2)

# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs. It avoids executing arbitrary pickled code and
# silences the torch.load FutureWarning.
img_state = torch.load(IMAGE_MODEL_PATH, map_location=DEVICE, weights_only=True)
image_model.load_state_dict(img_state)

image_model.to(DEVICE)
image_model.eval()


  audio_state = torch.load(AUDIO_HEAD_PATH, map_location=DEVICE)
  img_state = torch.load(IMAGE_MODEL_PATH, map_location=DEVICE)


MobileNetV3(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (avgpool): AdaptiveAvgPool2d(output_size=1)
          (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
          (fc2): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
          (activation): ReLU()
          (scale_activation): Hardsigmoid()
        )
        (2): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), 

In [79]:
def get_text_probs(loader):
    """Run the text model over ``loader`` and return the hoax probability per sample.

    Returns:
        1-D numpy array with the softmax probability of class 0 (hoax), in
        loader order.
    """
    text_model.eval()
    hoax_probs = []
    with torch.no_grad():
        for batch in loader:
            outputs = text_model(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
            )
            class_probs = softmax(outputs.logits, dim=-1)
            # Column 0 is the hoax class.
            hoax_probs += class_probs[:, 0].cpu().numpy().tolist()
    return np.array(hoax_probs)

def get_audio_probs(loader):
    """Run the audio head over ``loader`` and return the hoax probability per sample.

    Returns:
        1-D numpy array with the softmax probability of class 0 (hoax), in
        loader order.
    """
    audio_head.eval()
    hoax_probs = []
    with torch.no_grad():
        for batch in loader:
            logits = audio_head(batch["emb"].to(DEVICE))
            class_probs = softmax(logits, dim=-1)
            # Column 0 is the hoax class.
            hoax_probs += class_probs[:, 0].cpu().numpy().tolist()
    return np.array(hoax_probs)

def get_image_probs(loader):
    """Run the image model over ``loader`` and return the hoax probability per sample.

    Returns:
        1-D numpy array with the softmax probability of class 0 (hoax), in
        loader order.
    """
    image_model.eval()
    hoax_probs = []
    with torch.no_grad():
        for batch in loader:
            logits = image_model(batch["image"].to(DEVICE))
            class_probs = softmax(logits, dim=-1)
            # Column 0 is the hoax class.
            hoax_probs += class_probs[:, 0].cpu().numpy().tolist()
    return np.array(hoax_probs)


In [80]:
# Train
p_text_train  = get_text_probs(train_text_loader)
p_audio_train = get_audio_probs(train_audio_loader)
p_img_train   = get_image_probs(train_img_loader)

# Val
p_text_val  = get_text_probs(val_text_loader)
p_audio_val = get_audio_probs(val_audio_loader)
p_img_val   = get_image_probs(val_img_loader)

# Test
p_text_test  = get_text_probs(test_text_loader)
p_audio_test = get_audio_probs(test_audio_loader)
p_img_test   = get_image_probs(test_img_loader)

y_train = train_df["label_int"].to_numpy()
y_val   = val_df["label_int"].to_numpy()
y_test  = test_df["label_int"].to_numpy()


In [81]:
def evaluate_probs(p_hoax, y_true, thr=0.5):
    """Threshold hoax probabilities and score them against the true labels.

    Labels follow ``label_int``: 0 = hoax, 1 = valid. A sample is predicted as
    hoax (0) when its hoax probability is at least ``thr``, otherwise valid (1).

    Returns:
        Tuple ``(accuracy, f1_hoax, f1_valid, f1_macro)``.
    """
    y_pred = np.where(p_hoax >= thr, 0, 1)
    pair = (y_true, y_pred)

    accuracy = accuracy_score(*pair)
    hoax_f1 = f1_score(*pair, pos_label=0)
    valid_f1 = f1_score(*pair, pos_label=1)
    macro_f1 = f1_score(*pair, average="macro")
    return accuracy, hoax_f1, valid_f1, macro_f1

def fusion_two(p_a, p_b, alpha):
    """Blend two probability sources: ``alpha`` weights ``p_a``, ``1 - alpha`` weights ``p_b``."""
    weight_b = 1 - alpha
    return alpha * p_a + weight_b * p_b


In [82]:
# Grid-search the text weight for text+audio fusion on the validation split.
alphas = np.linspace(0, 1, 11)

records_ta = []
for a in alphas:
    p_ta_val = fusion_two(p_text_val, p_audio_val, a)
    # One row per candidate: (alpha, acc, f1_hoax, f1_valid, f1_macro).
    records_ta.append((a, *evaluate_probs(p_ta_val, y_val)))

df_ta = pd.DataFrame(records_ta, columns=["alpha_text", "acc", "f1_hoax", "f1_valid", "f1_macro"])
ranked_ta = df_ta.sort_values("f1_macro", ascending=False)
print(ranked_ta.head())

# Keep the candidate with the highest validation macro-F1.
best_row_ta = ranked_ta.iloc[0]
alpha_text_best = float(best_row_ta["alpha_text"])
print("Best alpha_text (text+audio):", alpha_text_best)

# Fused text+audio probabilities for every split, using the selected weight.
p_ta_train, p_ta_val, p_ta_test = (
    fusion_two(p_text, p_audio, alpha_text_best)
    for p_text, p_audio in (
        (p_text_train, p_audio_train),
        (p_text_val, p_audio_val),
        (p_text_test, p_audio_test),
    )
)


   alpha_text       acc   f1_hoax  f1_valid  f1_macro
1         0.1  0.903226  0.800000  0.936170  0.868085
0         0.0  0.806452  0.666667  0.863636  0.765152
2         0.2  0.838710  0.615385  0.897959  0.756672
4         0.4  0.838710  0.615385  0.897959  0.756672
5         0.5  0.806452  0.571429  0.875000  0.723214
Best alpha_text (text+audio): 0.1


In [83]:
# Grid-search the weight of the fused text+audio score against the image score
# on the validation split.
betas = np.linspace(0, 1, 11)

records_tai = []
for b in betas:
    # b weights the text+audio score, (1 - b) weights the image score.
    p_tai_val = fusion_two(p_ta_val, p_img_val, b)
    records_tai.append((b, *evaluate_probs(p_tai_val, y_val)))

df_tai = pd.DataFrame(records_tai, columns=["beta_ta", "acc", "f1_hoax", "f1_valid", "f1_macro"])
ranked_tai = df_tai.sort_values("f1_macro", ascending=False)
print(ranked_tai.head())

# Keep the candidate with the highest validation macro-F1.
best_row_tai = ranked_tai.iloc[0]
beta_ta_best = float(best_row_tai["beta_ta"])
print("Best beta_ta (text+audio vs img):", beta_ta_best)


    beta_ta       acc   f1_hoax  f1_valid  f1_macro
10      1.0  0.903226  0.800000  0.936170  0.868085
9       0.9  0.870968  0.714286  0.916667  0.815476
0       0.0  0.774194  0.363636  0.862745  0.613191
1       0.1  0.774194  0.363636  0.862745  0.613191
2       0.2  0.774194  0.363636  0.862745  0.613191
Best beta_ta (text+audio vs img): 1.0


In [84]:
# Text-only
acc_t, f1h_t, f1v_t, f1m_t = evaluate_probs(p_text_test, y_test)

# Audio-only
acc_a, f1h_a, f1v_a, f1m_a = evaluate_probs(p_audio_test, y_test)

# Image-only
acc_i, f1h_i, f1v_i, f1m_i = evaluate_probs(p_img_test, y_test)

# Text+Audio
acc_ta, f1h_ta, f1v_ta, f1m_ta = evaluate_probs(p_ta_test, y_test)

# Text+Audio+Image
p_tai_test = fusion_two(p_ta_test, p_img_test, beta_ta_best)
acc_tai, f1h_tai, f1v_tai, f1m_tai = evaluate_probs(p_tai_test, y_test)

print("=== Test Results ===")
print("Text        : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}".format(acc_t,  f1h_t,  f1v_t,  f1m_t))
print("Audio       : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}".format(acc_a,  f1h_a,  f1v_a,  f1m_a))
print("Image       : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}".format(acc_i,  f1h_i,  f1v_i,  f1m_i))
print("Text+Audio  : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}".format(acc_ta, f1h_ta, f1v_ta, f1m_ta))
print("T+A+Image   : acc={:.4f} f1h={:.4f} f1v={:.4f} f1m={:.4f}".format(acc_tai, f1h_tai, f1v_tai, f1m_tai))

print("\nClassification report (T+A+Image):")
y_pred_tai = np.where(p_tai_test >= 0.5, 0, 1)
print(classification_report(y_test, y_pred_tai, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_tai))


=== Test Results ===
Text        : acc=0.7188 f1h=0.4000 f1v=0.8163 f1m=0.6082
Audio       : acc=0.8125 f1h=0.6250 f1v=0.8750 f1m=0.7500
Image       : acc=0.6875 f1h=0.4444 f1v=0.7826 f1m=0.6135
Text+Audio  : acc=0.8125 f1h=0.5714 f1v=0.8800 f1m=0.7257
T+A+Image   : acc=0.8125 f1h=0.5714 f1v=0.8800 f1m=0.7257

Classification report (T+A+Image):
              precision    recall  f1-score   support

           0     0.6667    0.5000    0.5714         8
           1     0.8462    0.9167    0.8800        24

    accuracy                         0.8125        32
   macro avg     0.7564    0.7083    0.7257        32
weighted avg     0.8013    0.8125    0.8029        32

Confusion matrix:
 [[ 4  4]
 [ 2 22]]


In [85]:
import joblib
import numpy as np

class TextImageAudioLateFusion:
    """Two-stage weighted late fusion of text, audio and image hoax probabilities.

    Stage 1 blends text and audio with weight ``alpha_text``; stage 2 blends
    that result with the image score using weight ``beta_ta``.

    Args:
        alpha_text: weight of the text score in the text+audio fusion; audio
            gets ``1 - alpha_text``.
        beta_ta: weight of the fused text+audio score against the image score;
            image gets ``1 - beta_ta``.

    Raises:
        ValueError: if a weight lies outside ``[0, 1]``, since the fused value
            would then no longer be a valid probability.
    """

    def __init__(self, alpha_text=0.0, beta_ta=1.0):
        alpha_text = float(alpha_text)
        beta_ta = float(beta_ta)
        for weight_name, weight in (("alpha_text", alpha_text), ("beta_ta", beta_ta)):
            if not 0.0 <= weight <= 1.0:
                raise ValueError(f"{weight_name} must be within [0, 1], got {weight}")

        self.alpha_text = alpha_text
        self.alpha_audio = 1.0 - alpha_text
        self.beta_ta = beta_ta

    def fusion_text_audio(self, p_text_hoax, p_audio_hoax):
        """Return the text+audio hoax probability.

        Inputs are array-likes of shape ``(N,)`` holding hoax probabilities.
        They are converted with ``np.asarray`` so plain Python lists work too
        (``float * list`` would otherwise raise ``TypeError``).
        """
        p_text_hoax = np.asarray(p_text_hoax)
        p_audio_hoax = np.asarray(p_audio_hoax)
        return self.alpha_text * p_text_hoax + self.alpha_audio * p_audio_hoax

    def fusion_all(self, p_text_hoax, p_audio_hoax, p_img_hoax):
        """Return fused probabilities ``[p_hoax, p_valid]`` with shape ``(N, 2)``."""
        p_ta = self.fusion_text_audio(p_text_hoax, p_audio_hoax)
        p_hoax = self.beta_ta * p_ta + (1.0 - self.beta_ta) * np.asarray(p_img_hoax)
        p_valid = 1.0 - p_hoax
        return np.vstack([p_hoax, p_valid]).T   # shape (N, 2)

    def predict(self, p_text_hoax, p_audio_hoax, p_img_hoax, thr=0.5):
        """Return integer labels: 0 = hoax when fused ``p_hoax >= thr``, else 1 = valid."""
        probs = self.fusion_all(p_text_hoax, p_audio_hoax, p_img_hoax)
        # label: 0 = hoax, 1 = valid
        return (probs[:, 0] < thr).astype(int)


In [88]:
# Freeze the weights selected on the validation split into a fusion object.
fusion_tai = TextImageAudioLateFusion(
    alpha_text=alpha_text_best,
    beta_ta=beta_ta_best,
)

# The selected weights are embedded in the file name for traceability.
fname = rf"D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\fusion_final\text_image_audio_late_fusion_a{alpha_text_best:.2f}_b{beta_ta_best:.2f}.joblib"

# joblib.dump does not create parent folders; make sure the target directory
# exists so the save also works on a fresh checkout.
os.makedirs(os.path.dirname(fname), exist_ok=True)

# NOTE(review): the object is pickled by reference to its class, which is
# defined in this notebook. Loading the file elsewhere presumably requires the
# same TextImageAudioLateFusion definition to be importable -- confirm before
# deploying.
joblib.dump(fusion_tai, fname)
print("Saved 3-modal fusion model to:", fname)


Saved 3-modal fusion model to: D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\fusion_final\text_image_audio_late_fusion_a0.10_b1.00.joblib
