In [None]:
import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("Semua library berhasil diimport")
print(f"Versi PyTorch: {torch.__version__}")
print(f"CUDA tersedia: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Device GPU: {torch.cuda.get_device_name(0)}")


Semua library berhasil diimport
Versi PyTorch: 2.9.1+cpu
CUDA tersedia: False


In [2]:
print("=" * 80)
print("TAHAP 1: MEMUAT DAN EKSPLORASI DATA")
print("=" * 80)

# Baca dataset
df = pd.read_csv('D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\processed\dataset_clean_finalv1.csv')
print(f"\nJumlah data: {len(df):,} baris")
print(f"Jumlah kolom: {len(df.columns)}")

# Tampilkan kolom
print(f"\nNama kolom:")
print(df.columns.tolist())

# Tampilkan 5 data pertama
print("\nContoh data:")
print(df[['title', 'label_str', 'source_type']].head())

# Distribusi label
print("\nDistribusi label:")
print(df['label_str'].value_counts())


TAHAP 1: MEMUAT DAN EKSPLORASI DATA

Jumlah data: 3,068 baris
Jumlah kolom: 12

Nama kolom:
['id', 'url', 'domain', 'date', 'title', 'text', 'source_type', 'dataset_origin', 'fp_new', 'label', 'label_str', 'text_clean']

Contoh data:
                                               title label_str source_type
0  Profil Menko Hukum HAM Yusril Ihza Mahendra di...     valid        news
1  Profil Menteri PPPA Arifatul Choiri Fauzi di K...     valid        news
2  Prabowo Tunjuk Yassierli Jadi Menteri Ketenaga...     valid        news
3  Profil Yassierli, Menteri Ketenagakerjaan Kabi...     valid        news
4  Apa yang Harus Dilakukan di Usia 30 Tahun untu...     valid        news

Distribusi label:
label_str
valid    1895
hoax     1173
Name: count, dtype: int64


In [None]:
print("=" * 80)
print("TAHAP 2: PEMBERSIHAN DATA")
print("=" * 80)

# Gunakan title sebagai fitur
df['title_clean'] = df['title'].str.lower().str.strip()

# Hapus data kosong dan duplikat
print("\nMembersihkan data...")
ukuran_awal = len(df)

df = df.dropna(subset=['title_clean'])
df['jumlah_kata'] = df['title_clean'].str.split().str.len()
df = df[df['jumlah_kata'] >= 3]
df = df.drop_duplicates(subset=['title_clean'])

print(f"Ukuran awal: {ukuran_awal:,} baris")
print(f"Setelah dibersihkan: {len(df):,} baris")
print(f"Data yang dihapus: {ukuran_awal - len(df):,} baris")

# Statistik title
print("\nStatistik jumlah kata pada title:")
print(f"Valid - Rata-rata: {df[df['label_str']=='valid']['jumlah_kata'].mean():.1f} kata")
print(f"Hoax  - Rata-rata: {df[df['label_str']=='hoax']['jumlah_kata'].mean():.1f} kata")

# Contoh title
print("\nContoh title per kategori:")
for label in ['valid', 'hoax']:
    print(f"\n{label.upper()}:")
    sampel = df[df['label_str']==label].sample(2, random_state=42)
    for idx, row in sampel.iterrows():
        print(f"  - {row['title_clean']}")


TAHAP 2: PEMBERSIHAN DATA

Membersihkan data...
Ukuran awal: 3,068 baris
Setelah dibersihkan: 3,067 baris
Data yang dihapus: 1 baris

Statistik jumlah kata pada title:
Valid - Rata-rata: 10.4 kata
Hoax  - Rata-rata: 8.4 kata

Contoh title per kategori:

VALID:
  - difoto pakai kamera android, ariel noah: beda banget experience-nya
  - kronologi bripka anditya gugur di pantai pangandaran saat tolong wisatawan, diberi kenaikan pangkat

HOAX:
  - jokowi dan luhut terima uang rp4,5 t dari nadiem makarim
  - eksploitasi peristiwa pembakaran gedung bawaslu


In [None]:
print("=" * 80)
print("TAHAP 3: SPLIT DATA TRAINING DAN TESTING")
print("=" * 80)

# Split stratified
train_df, test_df = train_test_split(
    df, 
    test_size=0.2, 
    stratify=df['label'], 
    random_state=42
)

print(f"\nData training: {len(train_df):,} baris")
print(f"Data testing:  {len(test_df):,} baris")

# Siapkan array
X_train = train_df['title_clean'].values
y_train = train_df['label'].values
X_test = test_df['title_clean'].values
y_test = test_df['label'].values

# Distribusi label
print("\nDistribusi label training:")
print(train_df['label_str'].value_counts())
print("\nDistribusi label testing:")
print(test_df['label_str'].value_counts())


TAHAP 3: SPLIT DATA TRAINING DAN TESTING

Data training: 2,453 baris
Data testing:  614 baris

Distribusi label training:
label_str
valid    1515
hoax      938
Name: count, dtype: int64

Distribusi label testing:
label_str
valid    379
hoax     235
Name: count, dtype: int64


In [None]:
print("=" * 80)
print("TAHAP 4: PEMBUATAN FITUR TF-IDF")
print("=" * 80)

# Inisialisasi TF-IDF
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.7,
    sublinear_tf=True
)

# Transform data
print("\nMembuat fitur TF-IDF...")
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Ukuran vocabulary: {len(tfidf.vocabulary_):,}")
print(f"Dimensi training: {X_train_tfidf.shape}")
print(f"Dimensi testing:  {X_test_tfidf.shape}")

# Contoh fitur teratas
fitur_nama = tfidf.get_feature_names_out()
print(f"\n10 fitur pertama:")
for nama in fitur_nama[:10]:
    print(f"  - {nama}")


TAHAP 4: PEMBUATAN FITUR TF-IDF

Membuat fitur TF-IDF...
Ukuran vocabulary: 3,000
Dimensi training: (2453, 3000)
Dimensi testing:  (614, 3000)

10 fitur pertama:
  - 000
  - 000 dapat
  - 000 dapat berapa
  - 10
  - 10 juta
  - 10 persen
  - 10 tahun
  - 100
  - 100 hari
  - 11


In [None]:
print("=" * 80)
print("TAHAP 5: PERHITUNGAN BOBOT KELAS")
print("=" * 80)

# Hitung bobot
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

print("\nBobot kelas:")
print(f"  Hoax (0):  {class_weight_dict[0]:.4f}")
print(f"  Valid (1): {class_weight_dict[1]:.4f}")
print("\nKeterangan: Bobot lebih tinggi untuk kelas minoritas")


TAHAP 5: PERHITUNGAN BOBOT KELAS

Bobot kelas:
  Hoax (0):  1.3076
  Valid (1): 0.8096

Keterangan: Bobot lebih tinggi untuk kelas minoritas


In [None]:
print("=" * 80)
print("TAHAP 6: TRAINING MODEL TF-IDF + LOGISTIC REGRESSION")
print("=" * 80)

# Inisialisasi model
logreg = LogisticRegression(
    class_weight=class_weight_dict,
    max_iter=1000,
    C=1.0,
    random_state=42,
    verbose=1
)

# Training
print("\nMemulai training...")
logreg.fit(X_train_tfidf, y_train)
print("Training selesai")

# Simpan model
joblib.dump(tfidf, r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling\tf-idf/tfidf_title_vectorizer.pkl')
joblib.dump(logreg, r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling\tf-idf/logreg_title_model.pkl')
print("Model tersimpan")


TAHAP 6: TRAINING MODEL TF-IDF + LOGISTIC REGRESSION

Memulai training...


Training selesai
Model tersimpan


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [None]:
print("=" * 80)
print("TAHAP 6: TRAINING MODEL TF-IDF + LOGISTIC REGRESSION")
print("=" * 80)

# Inisialisasi model
logreg = LogisticRegression(
    class_weight=class_weight_dict,
    max_iter=1000,
    C=1.0,
    random_state=42,
    verbose=1
)

# Training
print("\nMemulai training...")
logreg.fit(X_train_tfidf, y_train)
print("Training selesai")

# Simpan model
joblib.dump(tfidf, r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling\tf-idf/tfidf_title_vectorizer.pkl')
joblib.dump(logreg, r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling\tf-idf/logreg_title_model.pkl')
print("Model tersimpan")


TAHAP 6: TRAINING MODEL TF-IDF + LOGISTIC REGRESSION

Memulai training...
Training selesai
Model tersimpan


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [None]:
print("=" * 80)
print("TAHAP 7: EVALUASI MODEL TF-IDF")
print("=" * 80)

# Prediksi
y_pred_tfidf = logreg.predict(X_test_tfidf)
y_proba_tfidf = logreg.predict_proba(X_test_tfidf)[:, 1]

# Metrik
acc_tfidf = accuracy_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')
auc_tfidf = roc_auc_score(y_test, y_proba_tfidf)

print("\nMETRIK PERFORMA:")
print(f"Accuracy:  {acc_tfidf:.4f}")
print(f"F1-Score:  {f1_tfidf:.4f}")
print(f"AUC-ROC:   {auc_tfidf:.4f}")

print("\nLAPORAN KLASIFIKASI:")
print(classification_report(y_test, y_pred_tfidf, target_names=['hoax', 'valid']))

print("\nMATRIKS KONFUSI:")
cm_tfidf = confusion_matrix(y_test, y_pred_tfidf)
print(cm_tfidf)
print(f"\nTrue Negative:  {cm_tfidf[0,0]}")
print(f"False Positive: {cm_tfidf[0,1]}")
print(f"False Negative: {cm_tfidf[1,0]}")
print(f"True Positive:  {cm_tfidf[1,1]}")


TAHAP 7: EVALUASI MODEL TF-IDF

METRIK PERFORMA:
Accuracy:  0.8550
F1-Score:  0.8556
AUC-ROC:   0.9360

LAPORAN KLASIFIKASI:
              precision    recall  f1-score   support

        hoax       0.80      0.83      0.81       235
       valid       0.89      0.87      0.88       379

    accuracy                           0.86       614
   macro avg       0.85      0.85      0.85       614
weighted avg       0.86      0.86      0.86       614


MATRIKS KONFUSI:
[[196  39]
 [ 50 329]]

True Negative:  196
False Positive: 39
False Negative: 50
True Positive:  329


In [None]:
print("=" * 80)
print("TAHAP 8: MEMUAT MODEL INDOBERT")
print("=" * 80)

# Load model
model_name = "indolem/indobert-base-uncased"
print(f"\nMemuat {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

print("IndoBERT berhasil dimuat")
print(f"Jumlah parameter model: {model.num_parameters():,}")

# Pindahkan model ke GPU jika tersedia
if torch.cuda.is_available():
    model = model.cuda()
    print("Model dipindahkan ke GPU")


TAHAP 8: MEMUAT MODEL INDOBERT

Memuat indolem/indobert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndoBERT berhasil dimuat
Jumlah parameter model: 110,559,746


In [None]:
print("=" * 80)
print("TAHAP 9: TOKENISASI DATA")
print("=" * 80)

print("\nMemulai tokenisasi...")

# Tokenisasi
train_encodings = tokenizer(
    train_df['title_clean'].tolist(),
    truncation=True,
    padding=True,
    max_length=64
)

test_encodings = tokenizer(
    test_df['title_clean'].tolist(),
    truncation=True,
    padding=True,
    max_length=64
)

print("Tokenisasi training selesai")
print("Tokenisasi testing selesai")
print(f"\nPanjang maksimum sequence: 64 token")
print(f"Ukuran vocabulary: {len(tokenizer)}")


TAHAP 9: TOKENISASI DATA

Memulai tokenisasi...
Tokenisasi training selesai
Tokenisasi testing selesai

Panjang maksimum sequence: 64 token
Ukuran vocabulary: 31923


In [None]:
print("=" * 80)
print("TAHAP 10: PEMBUATAN PYTORCH DATASET")
print("=" * 80)

class HoaxDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# Buat dataset
train_dataset = HoaxDataset(train_encodings, train_df['label'].tolist())
test_dataset = HoaxDataset(test_encodings, test_df['label'].tolist())

print(f"Dataset training: {len(train_dataset)} sampel")
print(f"Dataset testing:  {len(test_dataset)} sampel")


TAHAP 10: PEMBUATAN PYTORCH DATASET
Dataset training: 2453 sampel
Dataset testing:  614 sampel


In [None]:
print("=" * 80)
print("TAHAP 11: DEFINISI CUSTOM TRAINER")
print("=" * 80)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Loss dengan bobot kelas
        class_weights_tensor = torch.tensor(
            [class_weights[0], class_weights[1]], 
            dtype=torch.float
        ).to(logits.device)
        
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits, labels)
        
        return (loss, outputs) if return_outputs else loss

print("Custom Trainer berhasil didefinisikan")
print("Bobot kelas akan diterapkan saat training")


TAHAP 11: DEFINISI CUSTOM TRAINER
Custom Trainer berhasil didefinisikan
Bobot kelas akan diterapkan saat training


In [None]:
print("=" * 80)
print("TAHAP 12: KONFIGURASI TRAINING")
print("=" * 80)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=200,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=50,
    logging_dir='./logs',
)

print("Konfigurasi training:")
print(f"  - Jumlah epoch: {training_args.num_train_epochs}")
print(f"  - Ukuran batch: {training_args.per_device_train_batch_size}")
print(f"  - Learning rate: {training_args.learning_rate}")
print(f"  - Warmup steps: {training_args.warmup_steps}")


TAHAP 12: KONFIGURASI TRAINING
Konfigurasi training:
  - Jumlah epoch: 5
  - Ukuran batch: 32
  - Learning rate: 3e-05
  - Warmup steps: 200


In [17]:
print("=" * 80)
print("TAHAP 13: TRAINING MODEL INDOBERT")
print("=" * 80)
print("\nPERINGATAN: Training akan memakan waktu 15-30 menit!")
print("Silakan tunggu hingga selesai...\n")

# Inisialisasi trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Mulai training
trainer.train()

print("\nTraining IndoBERT selesai")

# Simpan model
model.save_pretrained(r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling/indobert_title_model')
tokenizer.save_pretrained(r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling/indobert_title_model')
print("Model tersimpan di folder 'indobert_title_model'")


TAHAP 13: TRAINING MODEL INDOBERT

PERINGATAN: Training akan memakan waktu 15-30 menit!
Silakan tunggu hingga selesai...



Epoch,Training Loss,Validation Loss
1,0.5349,0.478249
2,0.4253,0.305997
3,0.3137,0.260053
4,0.1839,0.203931
5,0.1418,0.229814



Training IndoBERT selesai
Model tersimpan di folder 'indobert_title_model'


In [18]:
print("=" * 80)
print("TAHAP 14: EVALUASI MODEL INDOBERT")
print("=" * 80)

# Set ke mode evaluasi
model.eval()

# Pindahkan model ke device yang tepat
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"Model berada di device: {device}")

# Prediksi
print("\nMemulai prediksi...")
predictions = []

with torch.no_grad():
    for i in tqdm(range(0, len(X_test), 32)):
        batch_texts = X_test[i:i+32].tolist()
        
        inputs = tokenizer(
            batch_texts, 
            padding=True, 
            truncation=True,
            max_length=64, 
            return_tensors='pt'
        ).to(device)
        
        outputs = model(**inputs)
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(batch_preds)

y_pred_bert = np.array(predictions)

# Metrik
acc_bert = accuracy_score(y_test, y_pred_bert)
f1_bert = f1_score(y_test, y_pred_bert, average='weighted')

print("\nMETRIK PERFORMA:")
print(f"Accuracy:  {acc_bert:.4f}")
print(f"F1-Score:  {f1_bert:.4f}")

print("\nLAPORAN KLASIFIKASI:")
print(classification_report(y_test, y_pred_bert, target_names=['hoax', 'valid']))

print("\nMATRIKS KONFUSI:")
cm_bert = confusion_matrix(y_test, y_pred_bert)
print(cm_bert)


TAHAP 14: EVALUASI MODEL INDOBERT
Model berada di device: cpu

Memulai prediksi...


100%|██████████| 20/20 [00:16<00:00,  1.20it/s]



METRIK PERFORMA:
Accuracy:  0.9349
F1-Score:  0.9344

LAPORAN KLASIFIKASI:
              precision    recall  f1-score   support

        hoax       0.94      0.89      0.91       235
       valid       0.93      0.97      0.95       379

    accuracy                           0.93       614
   macro avg       0.94      0.93      0.93       614
weighted avg       0.94      0.93      0.93       614


MATRIKS KONFUSI:
[[208  27]
 [ 13 366]]


In [19]:
print("=" * 80)
print("TAHAP 15: PERBANDINGAN MODEL")
print("=" * 80)

# Buat tabel perbandingan
comparison = pd.DataFrame({
    'Model': ['TF-IDF + LogReg', 'IndoBERT'],
    'Accuracy': [acc_tfidf, acc_bert],
    'F1-Score (Weighted)': [f1_tfidf, f1_bert],
    'F1-Score (Hoax)': [
        f1_score(y_test, y_pred_tfidf, average=None)[0],
        f1_score(y_test, y_pred_bert, average=None)[0]
    ],
    'F1-Score (Valid)': [
        f1_score(y_test, y_pred_tfidf, average=None)[1],
        f1_score(y_test, y_pred_bert, average=None)[1]
    ]
})

print("\nPERBANDINGAN AKHIR:")
print(comparison.to_string(index=False))

# Model terbaik
best_model = comparison.loc[comparison['Accuracy'].idxmax(), 'Model']
print(f"\nMODEL TERBAIK: {best_model}")

# Simpan hasil
comparison.to_csv(r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling/perbandingan_model.csv', index=False)
print("\nHasil tersimpan di 'perbandingan_model.csv'")


TAHAP 15: PERBANDINGAN MODEL

PERBANDINGAN AKHIR:
          Model  Accuracy  F1-Score (Weighted)  F1-Score (Hoax)  F1-Score (Valid)
TF-IDF + LogReg  0.855049             0.855639         0.814969          0.880857
       IndoBERT  0.934853             0.934444         0.912281          0.948187

MODEL TERBAIK: IndoBERT

Hasil tersimpan di 'perbandingan_model.csv'
