In [1]:
import json
import os
import warnings

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that would flag real problems in later cells - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Startup banner followed by a short report of the PyTorch / CUDA environment.
cuda_available = torch.cuda.is_available()
print("=" * 80, "PSEUDO-LABELING DATA YOUTUBE DENGAN BALANCED STRATEGY", "=" * 80, sep="\n")
print("\nLibrary berhasil diimport")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA tersedia: {cuda_available}")
print(f"Device yang akan digunakan: {'GPU' if cuda_available else 'CPU'}")
print("\nSetup selesai. Siap untuk tahap berikutnya.")


PSEUDO-LABELING DATA YOUTUBE DENGAN BALANCED STRATEGY

Library berhasil diimport
PyTorch version: 2.1.0+cu118
CUDA tersedia: True
Device yang akan digunakan: GPU

Setup selesai. Siap untuk tahap berikutnya.


In [2]:
print("\n" + "=" * 80, "MEMUAT MODEL INDOBERT", "=" * 80, sep="\n")

# Location of the fine-tuned model directory (adjust to your machine).
model_path = r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling\indobert_title_model'

print("\nMemuat model dari:")
print(f"  {model_path}")

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the sequence-classification model from the same
# directory, move the model to the chosen device and switch it to eval mode.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

print("\nModel berhasil dimuat")
print(f"Device: {device}")
print(f"Jumlah parameter: {model.num_parameters():,}")
print("Status: Evaluation mode (siap untuk prediksi)")



MEMUAT MODEL INDOBERT

Memuat model dari:
  D:\INDONERIS-DATAMINING\multimodal-hoax-detection\models\pseudo_labelling\indobert_title_model

Model berhasil dimuat
Device: cuda
Jumlah parameter: 110,559,746
Status: Evaluation mode (siap untuk prediksi)


In [3]:
print("\n" + "=" * 80, "MEMUAT DATA YOUTUBE", "=" * 80, sep="\n")

# Location of the YouTube metadata JSON file (adjust to your machine).
json_path = r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\normalization\metadata_yt.json'

print("\nMembaca file:")
print(f"  {json_path}")

# Parse the JSON file into a Python object.
with open(json_path, encoding='utf-8') as json_file:
    data_yt = json.load(json_file)

print(f"\nTotal data YouTube: {len(data_yt):,} video")

# Turn the parsed records into a DataFrame and report its dimensions.
df_yt = pd.DataFrame(data_yt)
n_records, n_columns = df_yt.shape

print("\nInfo dataset:")
print(f"  Total records: {n_records:,}")
print(f"  Total kolom: {n_columns}")

print("\nKolom yang tersedia:")
for position, column_name in enumerate(df_yt.columns, start=1):
    print(f"  {position:2d}. {column_name}")

# Preview a few identifying columns of the first five rows.
preview_columns = ['sample_id', 'title', 'channel', 'label']
print("\nContoh data (5 baris pertama):")
print(df_yt[preview_columns].head())



MEMUAT DATA YOUTUBE

Membaca file:
  D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\normalization\metadata_yt.json

Total data YouTube: 992 video

Info dataset:
  Total records: 992
  Total kolom: 25

Kolom yang tersedia:
   1. sample_id
   2. source
   3. keyword
   4. video_id
   5. title
   6. channel
   7. duration
   8. upload_date
   9. url
  10. audio_path
  11. thumbnail_path
  12. transcript_path
  13. transcript_text
  14. transcript_length
  15. language
  16. label
  17. status
  18. transcribed_at
  19. keyword_tier
  20. channel_type
  21. cleaned_text
  22. normalized_text
  23. token_count
  24. quality_status
  25. repetition_flag

Contoh data (5 baris pertama):
  sample_id                                              title     channel  \
0  YT_00002  [FULL] Wapres Gibran Umumkan Kado Istimewa dar...    KOMPASTV   
1  YT_00003  Live Event Rekam Jejak Prabowo Menjadi Preside...   METRO TV    
2  YT_00004  Ulas Utas, Setahun Presiden Prabowo: 'Koruptor...  tvOne

In [4]:
print("\n" + "="*80)
print("FILTERING DATA BERKUALITAS")
print("="*80)

print(f"\nData awal: {len(df_yt):,} video")

# Show how the quality_status values are distributed
print(f"\nDistribusi Quality Status:")
print(df_yt['quality_status'].value_counts())

# Show summary statistics of token_count
print(f"\nStatistik Token Count:")
print(df_yt['token_count'].describe())

# Filtering criteria (printed for the reader)
print(f"\n" + "-"*80)
print("KRITERIA FILTERING:")
print("-"*80)
print("1. Title tidak kosong")
print("2. Normalized text tersedia")
print("3. Token count minimal 3")
print("4. Quality status = GOOD")

# Apply the filter.
# "Not empty" must reject blank / whitespace-only strings as well as nulls;
# notna() alone would let "" or "   " through to the tokenizer.
has_title = df_yt['title'].notna() & df_yt['title'].astype(str).str.strip().ne('')
has_text = (
    df_yt['normalized_text'].notna()
    & df_yt['normalized_text'].astype(str).str.strip().ne('')
)

df_valid = df_yt[
    has_title &
    has_text &
    (df_yt['token_count'] >= 3) &
    (df_yt['quality_status'] == 'GOOD')
].copy()

# Filtering result
print(f"\n" + "-"*80)
print("HASIL FILTERING:")
print("-"*80)
# Guard the percentage against an empty input frame (division by zero).
pct_passed = len(df_valid) / len(df_yt) * 100 if len(df_yt) else 0.0
print(f"Data awal: {len(df_yt):,} video")
print(f"Data valid: {len(df_valid):,} video")
print(f"Data difilter: {len(df_yt) - len(df_valid):,} video")
print(f"Persentase lolos: {pct_passed:.1f}%")

# Token statistics of the rows that passed the filter
print(f"\nStatistik Data Valid:")
print(f"  Rata-rata token: {df_valid['token_count'].mean():.1f}")
print(f"  Median token: {df_valid['token_count'].median():.1f}")
print(f"  Min token: {df_valid['token_count'].min()}")
print(f"  Max token: {df_valid['token_count'].max()}")

# Ten channels with the most videos among the valid rows
print(f"\nTop 10 Channel:")
top_channels = df_valid['channel'].value_counts().head(10)
for channel, count in top_channels.items():
    print(f"  {channel}: {count} video")



FILTERING DATA BERKUALITAS

Data awal: 992 video

Distribusi Quality Status:
quality_status
GOOD    992
Name: count, dtype: int64

Statistik Token Count:
count     992.000000
mean      476.735887
std       287.127861
min        10.000000
25%       267.000000
50%       385.500000
75%       631.000000
max      1739.000000
Name: token_count, dtype: float64

--------------------------------------------------------------------------------
KRITERIA FILTERING:
--------------------------------------------------------------------------------
1. Title tidak kosong
2. Normalized text tersedia
3. Token count minimal 3
4. Quality status = GOOD



--------------------------------------------------------------------------------
HASIL FILTERING:
--------------------------------------------------------------------------------
Data awal: 992 video
Data valid: 992 video
Data difilter: 0 video
Persentase lolos: 100.0%

Statistik Data Valid:
  Rata-rata token: 476.7
  Median token: 385.5
  Min token: 10
  Max token: 1739

Top 10 Channel:
  METRO TV : 100 video
  KOMPASTV: 88 video
  tvOneNews : 84 video
  Kompas.com: 63 video
  Tribunnews: 57 video
  Official iNews: 57 video
  CNBC Indonesia: 25 video
  CNN Indonesia: 23 video
  BeritaSatu: 17 video
  Tribun MedanTV: 15 video


In [5]:
print("\n" + "="*80)
print("DEFINISI FUNGSI PREDIKSI")
print("="*80)

def prediksi_batch(titles, batch_size=8, max_length=32):
    """
    Predict a class for each text with the globally loaded classifier.

    Relies on the notebook-level globals `tokenizer`, `model` and `device`.

    Parameters:
        titles: Iterable of strings (list, tuple, pandas Series, ...).
        batch_size: Number of texts per forward pass (default 8); must be >= 1.
        max_length: Token length the texts are truncated to (default 32).

    Returns:
        predictions: 1-D array with the argmax class index of each text
            (this notebook maps 0=hoax, 1=valid).
        confidences: 1-D array with the softmax probability of the predicted class.
        probabilities: 2-D array (n_texts, n_classes) with all softmax probabilities.
        For empty input the arrays are empty but keep these ranks, so callers
        can still index `probabilities[:, k]`.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise as a list so slicing yields plain lists of strings whatever
    # sequence type the caller passed in (the tokenizer call expects a list).
    titles = list(titles)

    print(f"\nParameter prediksi:")
    print(f"  Total data: {len(titles):,}")
    print(f"  Batch size: {batch_size}")
    print(f"  Max length: {max_length}")
    print(f"  Device: {device}")

    if not titles:
        # Nothing to predict: return correctly shaped empty arrays instead of
        # a 1-D `probabilities` that would break column indexing downstream.
        n_classes = model.config.num_labels
        return (
            np.empty(0, dtype=np.int64),
            np.empty(0, dtype=np.float32),
            np.empty((0, n_classes), dtype=np.float32),
        )

    print(f"\nMemulai prediksi...")

    batch_predictions = []
    batch_confidences = []
    batch_probabilities = []

    # Process the texts batch by batch with a progress bar
    for start in tqdm(range(0, len(titles), batch_size), desc="Progress"):
        batch_titles = titles[start:start + batch_size]

        # Tokenise; pad to the longest item in the batch, truncate to max_length
        inputs = tokenizer(
            batch_titles,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        # Forward pass without gradient tracking
        with torch.no_grad():
            probs = torch.softmax(model(**inputs).logits, dim=1)
            # Highest probability per row and the class index it belongs to
            confidences, predictions = torch.max(probs, dim=1)

        batch_predictions.append(predictions.cpu().numpy())
        batch_confidences.append(confidences.cpu().numpy())
        batch_probabilities.append(probs.cpu().numpy())

    return (
        np.concatenate(batch_predictions),
        np.concatenate(batch_confidences),
        np.concatenate(batch_probabilities, axis=0)
    )

print("\nFungsi prediksi_batch berhasil didefinisikan")
print("\nSpesifikasi fungsi:")
print("  Input: List of titles")
print("  Output: (predictions, confidences, probabilities)")
print("  Label mapping: 0 = Hoax, 1 = Valid")



DEFINISI FUNGSI PREDIKSI

Fungsi prediksi_batch berhasil didefinisikan

Spesifikasi fungsi:
  Input: List of titles
  Output: (predictions, confidences, probabilities)
  Label mapping: 0 = Hoax, 1 = Valid


In [6]:
print(f"\n{'=' * 80}")
print("PROSES PSEUDO-LABELING")
print("=" * 80)

# The texts sent to the classifier come from the `normalized_text` column.
# NOTE(review): the model is loaded from a path named "indobert_title_model"
# and inputs are truncated to 32 tokens, while `normalized_text` rows average
# several hundred tokens - confirm that this column (not `title`) is intended.
titles_to_label = df_valid['normalized_text'].tolist()
n_texts = len(titles_to_label)

print(f"\nTotal title yang akan diprediksi: {n_texts:,}")
print(f"Estimasi waktu: sekitar {n_texts/8*0.5:.0f} detik")

# Run batched inference
predictions, confidences, probabilities = prediksi_batch(
    titles_to_label, batch_size=8, max_length=32
)

# Attach the model outputs to the frame (column 0 = hoax, column 1 = valid)
df_valid['predicted_label'] = predictions
df_valid['confidence'] = confidences
df_valid['prob_hoax'] = probabilities[:, 0]
df_valid['prob_valid'] = probabilities[:, 1]

# Human-readable label derived from the numeric class index
label_names = {0: 'hoax', 1: 'valid'}
df_valid['predicted_label_str'] = df_valid['predicted_label'].map(label_names)

print(f"\n{'=' * 80}")
print("HASIL PSEUDO-LABELING")
print("=" * 80)

print("\nPseudo-labeling selesai")
print(f"Total data yang dilabeli: {len(df_valid):,}")

# Class distribution of the predictions
print("\n" + "-" * 80)
print("DISTRIBUSI PREDIKSI:")
print("-" * 80)
dist = df_valid['predicted_label_str'].value_counts()
print(dist)

# The ratio is only defined when both classes were predicted at least once
if {'hoax', 'valid'} <= set(dist.index):
    ratio = dist['valid'] / dist['hoax']
    print(f"\nRatio Valid:Hoax = {ratio:.1f}:1")

# Summary statistics of the confidence column
print("\n" + "-" * 80)
print("STATISTIK CONFIDENCE:")
print("-" * 80)
confidence_column = df_valid['confidence']
confidence_summary = (
    ("Rata-rata", confidence_column.mean()),
    ("Median", confidence_column.median()),
    ("Min", confidence_column.min()),
    ("Max", confidence_column.max()),
    ("Std Dev", confidence_column.std()),
)
for stat_name, stat_value in confidence_summary:
    print(f"  {stat_name}: {stat_value:.4f}")

print("\nQuartiles:")
print(confidence_column.describe()[['25%', '50%', '75%']])



PROSES PSEUDO-LABELING

Total title yang akan diprediksi: 992
Estimasi waktu: sekitar 62 detik

Parameter prediksi:
  Total data: 992
  Batch size: 8
  Max length: 32
  Device: cuda

Memulai prediksi...


Progress: 100%|██████████| 124/124 [00:04<00:00, 24.84it/s]


HASIL PSEUDO-LABELING

Pseudo-labeling selesai
Total data yang dilabeli: 992

--------------------------------------------------------------------------------
DISTRIBUSI PREDIKSI:
--------------------------------------------------------------------------------
predicted_label_str
valid    971
hoax      21
Name: count, dtype: int64

Ratio Valid:Hoax = 46.2:1

--------------------------------------------------------------------------------
STATISTIK CONFIDENCE:
--------------------------------------------------------------------------------
  Rata-rata: 0.9267
  Median: 0.9594
  Min: 0.5027
  Max: 0.9949
  Std Dev: 0.0884

Quartiles:
25%    0.920848
50%    0.959390
75%    0.975943
Name: confidence, dtype: float64





In [7]:
print("\n" + "="*80)
print("ANALISIS DETAIL HASIL PREDIKSI")
print("="*80)

# SECTION 1: videos predicted as hoax, most confident first
print("\n" + "="*80)
print("BAGIAN 1: VIDEO YANG DIPREDIKSI HOAX")
print("="*80)

df_hoax_detected = df_valid[df_valid['predicted_label_str'] == 'hoax'].copy()
df_hoax_detected = df_hoax_detected.sort_values('confidence', ascending=False)

print(f"\nTotal hoax terdeteksi: {len(df_hoax_detected)}")
print(f"\nRincian {min(10, len(df_hoax_detected))} hoax teratas:")

for idx, (i, row) in enumerate(df_hoax_detected.head(10).iterrows(), 1):
    print(f"\n{idx}. [{row['sample_id']}]")
    print(f"   Title: {row['title'][:100]}...")
    print(f"   Channel: {row['channel']}")
    print(f"   Confidence: {row['confidence']:.4f} ({row['confidence']*100:.2f}%)")
    print(f"   Prob [Hoax|Valid]: [{row['prob_hoax']:.4f}|{row['prob_valid']:.4f}]")

# SECTION 2: "valid" predictions with low confidence (borderline cases)
print("\n" + "="*80)
print("BAGIAN 2: VALID DENGAN CONFIDENCE RENDAH (Borderline Cases)")
print("="*80)

# This section is about valid predictions, so count only those rows;
# counting the whole frame would mix in low-confidence hoax predictions
# and overstate how many valid rows sit under each threshold.
is_valid_pred = df_valid['predicted_label_str'] == 'valid'
valid_confidence = df_valid.loc[is_valid_pred, 'confidence']

n_below_70 = (valid_confidence < 0.70).sum()
n_below_75 = (valid_confidence < 0.75).sum()
n_below_80 = (valid_confidence < 0.80).sum()
n_below_90 = (valid_confidence < 0.90).sum()

print(f"\nDistribusi berdasarkan threshold confidence:")
print(f"  Confidence < 0.70: {n_below_70:,} samples")
print(f"  Confidence < 0.75: {n_below_75:,} samples")
print(f"  Confidence < 0.80: {n_below_80:,} samples")
print(f"  Confidence < 0.90: {n_below_90:,} samples")

# The 15 valid predictions the model is least sure about
df_borderline = df_valid[is_valid_pred].nsmallest(15, 'confidence')

print(f"\n15 Valid dengan Confidence Terendah (Potensi untuk Reclassify):")
for idx, (i, row) in enumerate(df_borderline.iterrows(), 1):
    print(f"\n{idx}. [{row['sample_id']}] Conf: {row['confidence']:.4f}")
    print(f"   Title: {row['title'][:80]}...")
    print(f"   Channel: {row['channel']}")
    print(f"   Prob [Hoax|Valid]: [{row['prob_hoax']:.4f}|{row['prob_valid']:.4f}]")

# SECTION 3: confidence distribution per range
print("\n" + "="*80)
print("BAGIAN 3: DISTRIBUSI CONFIDENCE PER RANGE")
print("="*80)

bins = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1.0]
labels = ['0.50-0.60', '0.60-0.70', '0.70-0.80', '0.80-0.90', '0.90-0.95', '0.95-1.00']

# include_lowest=True keeps a confidence of exactly 0.5 in the first bin;
# with the default right-closed bins it would become NaN and drop out.
df_valid['conf_range'] = pd.cut(
    df_valid['confidence'],
    bins=bins,
    labels=labels,
    include_lowest=True
)

print("\nDistribusi per Label dan Confidence Range:")
# observed=False: keep every range as a column even when it has no rows
dist_table = (
    df_valid
    .groupby(['predicted_label_str', 'conf_range'], observed=False)
    .size()
    .unstack(fill_value=0)
)
print(dist_table)

# SECTION 4: per-channel analysis
print("\n" + "="*80)
print("BAGIAN 4: ANALISIS PER CHANNEL")
print("="*80)

# Per channel: mean / std / count of confidence plus the number of hoax predictions
channel_stats = df_valid.groupby('channel').agg({
    'confidence': ['mean', 'std', 'count'],
    'predicted_label_str': lambda x: (x == 'hoax').sum()
}).round(4)

channel_stats.columns = ['avg_conf', 'std_conf', 'total_video', 'hoax_count']
channel_stats = channel_stats.sort_values('hoax_count', ascending=False)

print("\nTop 10 Channel dengan Hoax Terbanyak:")
print(channel_stats.head(10))

print("\nTop 10 Channel dengan Confidence Terendah:")
channel_low_conf = df_valid.groupby('channel').agg({
    'confidence': 'mean'
}).sort_values('confidence', ascending=True)
print(channel_low_conf.head(10))



ANALISIS DETAIL HASIL PREDIKSI

BAGIAN 1: VIDEO YANG DIPREDIKSI HOAX

Total hoax terdeteksi: 21

Rincian 10 hoax teratas:

1. [YT_00081]
   Title: Publik Geram, Hukum Tumpul di Kasus Silfester? - [Selamat Pagi Indonesia]...
   Channel: METRO TV 
   Confidence: 0.9056 (90.56%)
   Prob [Hoax|Valid]: [0.9056|0.0944]

2. [YT_00867]
   Title: Wakil Presiden Gibran Rakabuming Raka Digugat Rp125 Triliun, Jabatan Wapres Diminta Dibatalkan...
   Channel: Banjarmasin Post News Video
   Confidence: 0.8510 (85.10%)
   Prob [Hoax|Valid]: [0.8510|0.1490]

3. [YT_00904]
   Title: Peran dan Ideologi Partai Politik di Indonesia | UAS Sistem Politik Indonesia...
   Channel: scientialoquendi
   Confidence: 0.7721 (77.21%)
   Prob [Hoax|Valid]: [0.7721|0.2279]

4. [YT_01161]
   Title: [BREAKING NEWS] Sederet Nama yang Dilantik Menjadi Komisi Percepatan Reformasi Polri | tvOne...
   Channel: tvOneNews 
   Confidence: 0.7557 (75.57%)
   Prob [Hoax|Valid]: [0.7557|0.2443]

5. [YT_00039]
   Title: Apakah Ben

In [8]:
print("\n" + "="*80)
print("STRATEGI BALANCING DENGAN RECLASSIFICATION")
print("="*80)

# BALANCING CONFIGURATION
HOAX_THRESHOLD = 0.70        # "valid" predictions with confidence below this are relabelled as hoax
VALID_THRESHOLD = 0.90       # "valid" predictions need at least this confidence to be kept
TARGET_RATIO = 3.0           # Target Valid:Hoax ratio
MAX_HOAX_SAMPLES = 80        # Upper bound on the number of hoax samples
TOTAL_BUDGET = 300           # Upper bound on the total number of pseudo-labels

print("\nKONFIGURASI:")
print(f"  Hoax threshold (reclassify borderline): < {HOAX_THRESHOLD}")
print(f"  Valid threshold (high-confidence): >= {VALID_THRESHOLD}")
print(f"  Target ratio Valid:Hoax: {TARGET_RATIO}:1")
print(f"  Max hoax samples: {MAX_HOAX_SAMPLES}")
print(f"  Total budget: {TOTAL_BUDGET}")

# STAGE 1: rows the model already predicted as hoax
print("\n" + "="*80)
print("TAHAP 1: HOAX YANG SUDAH TERDETEKSI")
print("="*80)

df_hoax_original = df_valid[df_valid['predicted_label_str'] == 'hoax'].copy()
df_hoax_original['source_type'] = 'predicted_hoax'

print(f"\nHoax terdeteksi langsung: {len(df_hoax_original)}")
print(f"Confidence range: {df_hoax_original['confidence'].min():.4f} - {df_hoax_original['confidence'].max():.4f}")

# STAGE 2: relabel borderline "valid" predictions as hoax
print("\n" + "="*80)
print("TAHAP 2: RECLASSIFY BORDERLINE VALID")
print("="*80)

df_borderline = df_valid[
    (df_valid['predicted_label_str'] == 'valid') &
    (df_valid['confidence'] < HOAX_THRESHOLD)
].copy()

# Least confident rows first
df_borderline = df_borderline.sort_values('confidence', ascending=True)

print(f"\nTotal borderline valid (conf < {HOAX_THRESHOLD}): {len(df_borderline)}")

# Fill the remaining hoax quota. Clamp at 0: a negative value passed to
# DataFrame.head() means "all rows except the last |n|", which would select
# rows exactly when the quota is already exceeded.
n_borderline_needed = max(
    0,
    min(MAX_HOAX_SAMPLES - len(df_hoax_original), len(df_borderline))
)

# .copy() so the column assignments below act on an independent frame
# rather than on a slice of df_borderline.
df_borderline_selected = df_borderline.head(n_borderline_needed).copy()
df_borderline_selected['source_type'] = 'reclassified'
# NOTE(review): only the string label is overwritten. `predicted_label` and
# `confidence` keep the model's original "valid" values, so for these rows
# `confidence` is the probability of the valid class, not of hoax.
df_borderline_selected['predicted_label_str'] = 'hoax'

print(f"Borderline yang di-reclassify: {len(df_borderline_selected)}")
print(f"Confidence range reclassified: {df_borderline_selected['confidence'].min():.4f} - {df_borderline_selected['confidence'].max():.4f}")

# STAGE 3: combine predicted and reclassified hoax rows
print("\n" + "="*80)
print("TAHAP 3: GABUNGKAN HOAX")
print("="*80)

df_hoax_final = pd.concat([df_hoax_original, df_borderline_selected])

print(f"\nTotal hoax final:")
print(f"  Predicted hoax: {len(df_hoax_original)}")
print(f"  Reclassified: {len(df_borderline_selected)}")
print(f"  Total: {len(df_hoax_final)}")

# STAGE 4: pick high-confidence valid rows
print("\n" + "="*80)
print("TAHAP 4: SELEKSI VALID HIGH-CONFIDENCE")
print("="*80)

df_valid_highconf = df_valid[
    (df_valid['predicted_label_str'] == 'valid') &
    (df_valid['confidence'] >= VALID_THRESHOLD)
].copy()

# Most confident rows first
df_valid_highconf = df_valid_highconf.sort_values('confidence', ascending=False)

# Number of valid rows wanted for the target ratio, limited by availability
n_valid_needed = int(len(df_hoax_final) * TARGET_RATIO)
n_valid_needed = min(n_valid_needed, len(df_valid_highconf))

# Respect the total budget; clamp at 0 so a hoax count above the budget
# cannot turn into a negative head() argument.
n_valid_needed = max(0, min(n_valid_needed, TOTAL_BUDGET - len(df_hoax_final)))

df_valid_selected = df_valid_highconf.head(n_valid_needed).copy()
df_valid_selected['source_type'] = 'high_confidence'

print(f"\nValid tersedia (conf >= {VALID_THRESHOLD}): {len(df_valid_highconf)}")
print(f"Valid dipilih: {n_valid_needed}")
print(f"Confidence range: {df_valid_selected['confidence'].min():.4f} - {df_valid_selected['confidence'].max():.4f}")

# STAGE 5: stack the hoax and valid selections into the final balanced frame
print(f"\n{'=' * 80}")
print("TAHAP 5: HASIL FINAL BALANCED DATASET")
print("=" * 80)

df_balanced = pd.concat([df_hoax_final, df_valid_selected])

# Final class distribution
dist_final = df_balanced['predicted_label_str'].value_counts()
print("\nDistribusi Final:")
print(dist_final)

# The ratio is only reported when both classes are present
if {'hoax', 'valid'} <= set(dist_final.index):
    final_ratio = dist_final['valid'] / dist_final['hoax']
    print(f"\nRatio Final Valid:Hoax: {final_ratio:.2f}:1")

print(f"Total Pseudo-Labels: {len(df_balanced)}")

# Confidence statistics for each class
print(f"\n{'=' * 80}")
print("STATISTIK CONFIDENCE PER CLASS")
print("=" * 80)

for label in ('hoax', 'valid'):
    subset = df_balanced[df_balanced['predicted_label_str'] == label]
    if subset.empty:
        continue
    class_confidence = subset['confidence']
    print(f"\n{label.upper()}:")
    print(f"  Count: {len(subset)}")
    print(f"  Mean confidence: {class_confidence.mean():.4f}")
    print(f"  Median confidence: {class_confidence.median():.4f}")
    print(f"  Min confidence: {class_confidence.min():.4f}")
    print(f"  Max confidence: {class_confidence.max():.4f}")
    print(f"  Std Dev: {class_confidence.std():.4f}")

# Row counts per (label, source_type) combination
print(f"\n{'=' * 80}")
print("BREAKDOWN PER SOURCE TYPE")
print("=" * 80)

source_breakdown = (
    df_balanced
    .groupby(['predicted_label_str', 'source_type'])
    .size()
    .unstack(fill_value=0)
)
print(source_breakdown)

# Share of the final dataset that was relabelled from valid to hoax
n_reclassified = (df_balanced['source_type'] == 'reclassified').sum()
pct_reclassified = n_reclassified / len(df_balanced) * 100

print(f"\nTotal borderline cases yang di-reclassify: {n_reclassified} ({pct_reclassified:.1f}%)")



STRATEGI BALANCING DENGAN RECLASSIFICATION

KONFIGURASI:
  Hoax threshold (reclassify borderline): < 0.7
  Valid threshold (high-confidence): >= 0.9
  Target ratio Valid:Hoax: 3.0:1
  Max hoax samples: 80
  Total budget: 300

TAHAP 1: HOAX YANG SUDAH TERDETEKSI

Hoax terdeteksi langsung: 21
Confidence range: 0.5027 - 0.9056

TAHAP 2: RECLASSIFY BORDERLINE VALID

Total borderline valid (conf < 0.7): 31
Borderline yang di-reclassify: 31
Confidence range reclassified: 0.5100 - 0.6999

TAHAP 3: GABUNGKAN HOAX

Total hoax final:
  Predicted hoax: 21
  Reclassified: 31
  Total: 52

TAHAP 4: SELEKSI VALID HIGH-CONFIDENCE

Valid tersedia (conf >= 0.9): 797
Valid dipilih: 156
Confidence range: 0.9813 - 0.9949

TAHAP 5: HASIL FINAL BALANCED DATASET

Distribusi Final:
predicted_label_str
valid    156
hoax      52
Name: count, dtype: int64

Ratio Final Valid:Hoax: 3.00:1
Total Pseudo-Labels: 208

STATISTIK CONFIDENCE PER CLASS

HOAX:
  Count: 52
  Mean confidence: 0.6278
  Median confidence: 0.62

In [9]:
print("\n" + "="*80)
print("ANALISIS DISTRIBUSI CONFIDENCE")
print("="*80)

# Confidence bins
bins = [0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.0]
labels_bins = ['0.50-0.60', '0.60-0.70', '0.70-0.80', '0.80-0.90', '0.90-0.95', '0.95-1.00']

# include_lowest=True keeps a confidence of exactly 0.50 in the first bin.
# With the default right-closed bins it would become NaN, vanish from the
# tables below and be exported with an empty confidence range.
df_balanced['conf_bin'] = pd.cut(
    df_balanced['confidence'],
    bins=bins,
    labels=labels_bins,
    include_lowest=True
)

# Distribution per class
print("\nDistribusi Confidence per Class:")

for label in ['hoax', 'valid']:
    subset = df_balanced[df_balanced['predicted_label_str'] == label]
    if len(subset) > 0:
        print(f"\n{label.upper()}:")
        # sort_index() orders the rows by bin rather than by frequency
        bin_dist = subset['conf_bin'].value_counts().sort_index()
        for bin_name, count in bin_dist.items():
            pct = count / len(subset) * 100
            print(f"  {bin_name}: {count:3d} ({pct:5.1f}%)")

# Cross-table of label vs. confidence bin, with row/column totals
print("\n" + "="*80)
print("CROSS-TABLE DISTRIBUSI")
print("="*80)

cross_tab = pd.crosstab(
    df_balanced['predicted_label_str'],
    df_balanced['conf_bin'],
    margins=True
)
print(cross_tab)

# Hoax rows broken down by confidence bin and by how they were labelled
print("\n" + "="*80)
print("BREAKDOWN HOAX PER CONFIDENCE RANGE DAN SOURCE")
print("="*80)

hoax_subset = df_balanced[df_balanced['predicted_label_str'] == 'hoax']

for bin_name in labels_bins:
    bin_data = hoax_subset[hoax_subset['conf_bin'] == bin_name]
    if len(bin_data) > 0:
        n_predicted = (bin_data['source_type'] == 'predicted_hoax').sum()
        n_reclassified = (bin_data['source_type'] == 'reclassified').sum()

        print(f"\n{bin_name}: {len(bin_data)} samples")
        print(f"  Predicted hoax: {n_predicted}")
        print(f"  Reclassified: {n_reclassified}")

# Channel distribution of the hoax rows
print("\n" + "="*80)
print("CHANNEL DISTRIBUTION UNTUK HOAX")
print("="*80)

hoax_channels = hoax_subset['channel'].value_counts().head(15)
print("\nTop 15 Channel dengan Hoax Terbanyak:")
for idx, (channel, count) in enumerate(hoax_channels.items(), 1):
    print(f"  {idx:2d}. {channel}: {count} hoax")

# Channel distribution of the valid rows
print("\n" + "="*80)
print("CHANNEL DISTRIBUTION UNTUK VALID")
print("="*80)

valid_subset = df_balanced[df_balanced['predicted_label_str'] == 'valid']
valid_channels = valid_subset['channel'].value_counts().head(10)

print("\nTop 10 Channel Valid:")
for idx, (channel, count) in enumerate(valid_channels.items(), 1):
    print(f"  {idx:2d}. {channel}: {count} valid")



ANALISIS DISTRIBUSI CONFIDENCE

Distribusi Confidence per Class:

HOAX:
  0.50-0.60:  22 ( 42.3%)
  0.60-0.70:  22 ( 42.3%)
  0.70-0.80:   6 ( 11.5%)
  0.80-0.90:   1 (  1.9%)
  0.90-0.95:   1 (  1.9%)
  0.95-1.00:   0 (  0.0%)

VALID:
  0.50-0.60:   0 (  0.0%)
  0.60-0.70:   0 (  0.0%)
  0.70-0.80:   0 (  0.0%)
  0.80-0.90:   0 (  0.0%)
  0.90-0.95:   0 (  0.0%)
  0.95-1.00: 156 (100.0%)

CROSS-TABLE DISTRIBUSI
conf_bin             0.50-0.60  0.60-0.70  0.70-0.80  0.80-0.90  0.90-0.95  \
predicted_label_str                                                          
hoax                        22         22          6          1          1   
valid                        0          0          0          0          0   
All                         22         22          6          1          1   

conf_bin             0.95-1.00  All  
predicted_label_str                  
hoax                         0   52  
valid                      156  156  
All                        156  208  

B

In [10]:
print(f"\n{'=' * 80}")
print("REVIEW SAMPLE PSEUDO-LABELS")
print("=" * 80)

# SECTION 1: hoax examples drawn from several confidence ranges
print(f"\n{'=' * 80}")
print("BAGIAN 1: SAMPLE HOAX (10 samples dari berbagai range)")
print("=" * 80)

hoax_samples = df_balanced[df_balanced['predicted_label_str'] == 'hoax'].copy()

# Take the first few rows of each confidence bin: 3 low, 4 mid, 3 high
review_plan = [('0.50-0.60', 3), ('0.60-0.70', 4), ('0.70-0.80', 3)]
sample_low, sample_mid, sample_high = (
    hoax_samples[hoax_samples['conf_bin'] == bin_label].head(n_rows)
    for bin_label, n_rows in review_plan
)

hoax_review = pd.concat([sample_low, sample_mid, sample_high])

for rank, (_, row) in enumerate(hoax_review.iterrows(), start=1):
    # Mark whether the hoax label came from the model or from reclassification
    if row['source_type'] == 'reclassified':
        source_mark = "[RECLASSIFIED]"
    else:
        source_mark = "[PREDICTED]"
    confidence = row['confidence']

    print(f"\n{rank}. {source_mark} [{row['sample_id']}]")
    print(f"   Title: {row['title'][:90]}...")
    print(f"   Channel: {row['channel']}")
    print(f"   Label: HOAX | Confidence: {confidence:.4f} ({confidence*100:.2f}%)")
    print(f"   Prob [Hoax|Valid]: [{row['prob_hoax']:.4f}|{row['prob_valid']:.4f}]")

# SECTION 2: the five most confident valid rows
print(f"\n{'=' * 80}")
print("BAGIAN 2: SAMPLE VALID HIGH-CONFIDENCE (5 samples)")
print("=" * 80)

is_valid_label = df_balanced['predicted_label_str'] == 'valid'
valid_samples = df_balanced[is_valid_label].nlargest(5, 'confidence')

for rank, (_, row) in enumerate(valid_samples.iterrows(), start=1):
    confidence = row['confidence']

    print(f"\n{rank}. [{row['sample_id']}]")
    print(f"   Title: {row['title'][:90]}...")
    print(f"   Channel: {row['channel']}")
    print(f"   Label: VALID | Confidence: {confidence:.4f} ({confidence*100:.2f}%)")
    print(f"   Prob [Hoax|Valid]: [{row['prob_hoax']:.4f}|{row['prob_valid']:.4f}]")

# SECTION 3: the five least confident hoax rows (candidates for manual review)
print(f"\n{'=' * 80}")
print("BAGIAN 3: HOAX DENGAN CONFIDENCE TERENDAH (Butuh Manual Review)")
print("=" * 80)

low_conf_hoax = hoax_samples.nsmallest(5, 'confidence')

print("\n5 Hoax dengan confidence terendah (rekomendasi untuk manual review):")

for rank, (_, row) in enumerate(low_conf_hoax.iterrows(), start=1):
    if row['source_type'] == 'reclassified':
        source_mark = "[RECLASSIFIED]"
    else:
        source_mark = "[PREDICTED]"

    print(f"\n{rank}. {source_mark} Confidence: {row['confidence']:.4f}")
    print(f"   ID: {row['sample_id']}")
    print(f"   Title: {row['title']}")
    print(f"   Channel: {row['channel']}")
    print(f"   Prob [Hoax|Valid]: [{row['prob_hoax']:.4f}|{row['prob_valid']:.4f}]")

# SECTION 4: first five rows that were relabelled from valid to hoax
print(f"\n{'=' * 80}")
print("BAGIAN 4: SAMPLE RECLASSIFIED (5 contoh)")
print("=" * 80)

is_reclassified = df_balanced['source_type'] == 'reclassified'
reclassified_samples = df_balanced[is_reclassified].head(5)

print("\n5 Borderline valid yang di-reclassify jadi hoax:")

for rank, (_, row) in enumerate(reclassified_samples.iterrows(), start=1):
    print(f"\n{rank}. [{row['sample_id']}] Conf: {row['confidence']:.4f}")
    print(f"   Title: {row['title']}")
    print(f"   Channel: {row['channel']}")
    print(f"   Prob [Hoax|Valid]: [{row['prob_hoax']:.4f}|{row['prob_valid']:.4f}]")



REVIEW SAMPLE PSEUDO-LABELS

BAGIAN 1: SAMPLE HOAX (10 samples dari berbagai range)

1. [PREDICTED] [YT_00035]
   Title: Wapres Gibran Tunaikan Sholat Ied Idul Fitri dan Bersilaturahmi Dengan Presiden...
   Channel: Wakil Presiden Republik Indonesia
   Label: HOAX | Confidence: 0.5572 (55.72%)
   Prob [Hoax|Valid]: [0.5572|0.4428]

2. [PREDICTED] [YT_00261]
   Title: Respons China soal Pertemuan Trump dengan PM Jepang Sanae Takaichi...
   Channel: Kompas.com
   Label: HOAX | Confidence: 0.5227 (52.27%)
   Prob [Hoax|Valid]: [0.5227|0.4773]

3. [PREDICTED] [YT_00295]
   Title: Violence sweeps across Indonesia amid protests over worsening economy...
   Channel: PBS NewsHour
   Label: HOAX | Confidence: 0.5799 (57.99%)
   Prob [Hoax|Valid]: [0.5799|0.4201]

4. [PREDICTED] [YT_00011]
   Title: Mengapa Presiden Prabowo Selalu Pilih Kader Gerindra dalam Reshuffle Kabinet?...
   Channel: Kompas.com
   Label: HOAX | Confidence: 0.6828 (68.28%)
   Prob [Hoax|Valid]: [0.6828|0.3172]

5. [PREDIC

In [11]:
print("\n" + "="*80)
print("EXPORT HASIL PSEUDO-LABELING")
print("="*80)

# Pilih kolom penting untuk export
output_columns = [
    'sample_id', 'source', 'keyword', 'video_id',
    'title', 'channel', 'duration', 'upload_date',
    'url', 'audio_path', 'thumbnail_path',
    'transcript_path', 'transcript_text', 'transcript_length',
    'normalized_text', 'token_count',
    'predicted_label_str', 'confidence',
    'prob_hoax', 'prob_valid',
    'conf_bin', 'source_type'
]

df_export = df_balanced[output_columns].copy()

# Rename kolom untuk clarity
df_export = df_export.rename(columns={
    'predicted_label_str': 'pseudo_label',
    'confidence': 'pseudo_confidence',
    'conf_bin': 'confidence_range',
    'source_type': 'labeling_source'
})

# Sort by pseudo_label dan confidence
df_export = df_export.sort_values(
    ['pseudo_label', 'pseudo_confidence'],
    ascending=[True, False]
)

# Path output (sesuaikan dengan folder Anda)
output_dir = r'D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\training'
output_csv = f'{output_dir}/youtube_pseudo_labeled_balanced.csv'
output_json = f'{output_dir}/youtube_pseudo_labeled_balanced.json'

print(f"\nPath output:")
print(f"  Directory: {output_dir}")
print(f"  CSV: youtube_pseudo_labeled_balanced.csv")
print(f"  JSON: youtube_pseudo_labeled_balanced.json")

# Export ke CSV
df_export.to_csv(output_csv, index=False, encoding='utf-8')
print(f"\nData berhasil diekspor ke CSV")

# Export ke JSON
df_export.to_json(output_json, orient='records', indent=2, force_ascii=False)
print(f"Backup JSON berhasil dibuat")

# Summary export
print("\n" + "="*80)
print("SUMMARY EXPORT")
print("="*80)

print(f"\nTotal records: {len(df_export)}")

print(f"\nDistribusi pseudo-label:")
label_dist = df_export['pseudo_label'].value_counts()
for label, count in label_dist.items():
    pct = count / len(df_export) * 100
    print(f"  {label}: {count} ({pct:.1f}%)")

print(f"\nDistribusi confidence range:")
conf_dist = df_export['confidence_range'].value_counts().sort_index()
for range_name, count in conf_dist.items():
    print(f"  {range_name}: {count}")

print(f"\nDistribusi labeling source:")
source_dist = df_export['labeling_source'].value_counts()
for source, count in source_dist.items():
    pct = count / len(df_export) * 100
    print(f"  {source}: {count} ({pct:.1f}%)")

# Confidence statistics per label
print("\n" + "=" * 80)
print("STATISTIK CONFIDENCE PER LABEL")
print("=" * 80)

for class_name in ('hoax', 'valid'):
    # Confidence values of the rows carrying this pseudo-label
    class_conf = df_export.loc[
        df_export['pseudo_label'] == class_name, 'pseudo_confidence'
    ]
    if len(class_conf) == 0:
        # Nothing to report for a label that is absent from the export
        continue

    print(f"\n{class_name.upper()}:")
    print(f"  Count: {len(class_conf)}")
    for stat_name, stat_value in (
        ("Mean", class_conf.mean()),
        ("Median", class_conf.median()),
        ("Min", class_conf.min()),
        ("Max", class_conf.max()),
        ("Std Dev", class_conf.std()),
    ):
        print(f"  {stat_name}: {stat_value:.4f}")

# Final summary: generated files, class counts, class ratio, mean confidence
# per class and the share of reclassified rows.
print("\n" + "=" * 80)
print("PSEUDO-LABELING SELESAI")
print("=" * 80)

print("\nFile yang dihasilkan:")
for position, exported_path in enumerate((output_csv, output_json), start=1):
    print(f"  {position}. {exported_path}")

# Row masks for the two classes, reused for counts and mean confidence
is_hoax = df_export['pseudo_label'] == 'hoax'
is_valid = df_export['pseudo_label'] == 'valid'

print("\nRINGKASAN AKHIR:")
print(f"  Total pseudo-labels: {len(df_export)}")
print(f"  Hoax: {is_hoax.sum()}")
print(f"  Valid: {is_valid.sum()}")

# The ratio is only reported when both classes occur in label_dist
if all(class_name in label_dist.index for class_name in ('hoax', 'valid')):
    ratio = label_dist['valid'] / label_dist['hoax']
    print(f"  Ratio Valid:Hoax: {ratio:.2f}:1")

hoax_mean_conf = df_export.loc[is_hoax, 'pseudo_confidence'].mean()
valid_mean_conf = df_export.loc[is_valid, 'pseudo_confidence'].mean()
print(f"  Avg confidence (Hoax): {hoax_mean_conf:.4f}")
print(f"  Avg confidence (Valid): {valid_mean_conf:.4f}")

# Rows whose labeling source is 'reclassified', and their share of the export
n_reclassified = (df_export['labeling_source'] == 'reclassified').sum()
reclassified_pct = n_reclassified / len(df_export) * 100
print(f"  Borderline reclassified: {n_reclassified} ({reclassified_pct:.1f}%)")

# Static "next steps" recommendations: each entry is a heading followed by
# its already-indented detail lines, printed verbatim.
next_steps = (
    ("1. Manual Review", (
        "   Review 5 hoax dengan confidence terendah untuk quality assurance",
    )),
    ("2. Integrasi Data", (
        "   Gabungkan dengan data TBH dan News (original labeled data)",
    )),
    ("3. Training Strategy", (
        "   - Gunakan curriculum learning (high-conf dulu, lalu semua data)",
        "   - Apply weighted loss berdasarkan pseudo_confidence",
        "   - Sample weight = confidence^2 untuk pseudo-labels",
    )),
    ("4. Evaluation", (
        "   - 5-fold stratified cross-validation",
        "   - Monitor F1-score untuk hoax class (minority)",
        "   - Tune threshold untuk optimal precision-recall",
    )),
    ("5. Expected Performance", (
        "   - Overall accuracy: 83-87%",
        "   - Hoax F1-score: 0.75-0.80",
        "   - ROC-AUC: 0.86-0.90",
    )),
)

print("\n" + "=" * 80)
print("REKOMENDASI NEXT STEPS")
print("=" * 80)

for heading, detail_lines in next_steps:
    # A blank line separates each numbered recommendation from the previous one
    print("\n" + heading)
    for detail in detail_lines:
        print(detail)

print("\n" + "=" * 80)
print("PSEUDO-LABELING BERHASIL DISELESAIKAN")
print("=" * 80)



EXPORT HASIL PSEUDO-LABELING

Path output:
  Directory: D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\training
  CSV: youtube_pseudo_labeled_balanced.csv
  JSON: youtube_pseudo_labeled_balanced.json

Data berhasil diekspor ke CSV
Backup JSON berhasil dibuat

SUMMARY EXPORT

Total records: 208

Distribusi pseudo-label:
  valid: 156 (75.0%)
  hoax: 52 (25.0%)

Distribusi confidence range:
  0.50-0.60: 22
  0.60-0.70: 22
  0.70-0.80: 6
  0.80-0.90: 1
  0.90-0.95: 1
  0.95-1.00: 156

Distribusi labeling source:
  high_confidence: 156 (75.0%)
  reclassified: 31 (14.9%)
  predicted_hoax: 21 (10.1%)

STATISTIK CONFIDENCE PER LABEL

HOAX:
  Count: 52
  Mean: 0.6278
  Median: 0.6285
  Min: 0.5027
  Max: 0.9056
  Std Dev: 0.0859

VALID:
  Count: 156
  Mean: 0.9860
  Median: 0.9859
  Min: 0.9813
  Max: 0.9949
  Std Dev: 0.0030

PSEUDO-LABELING SELESAI

File yang dihasilkan:
  1. D:\INDONERIS-DATAMINING\multimodal-hoax-detection\data\training/youtube_pseudo_labeled_balanced.csv
  2. D:\I

In [12]:
print("\n" + "=" * 80)
print("PERSIAPAN DATA TRAINING (TANPA MANUAL REVIEW)")
print("=" * 80)

# Use all exported rows; each row gets a weight of confidence squared so a
# weighted loss can rely less on low-confidence pseudo-labels.
# assign() returns a new frame, so df_export itself is left unchanged.
df_final = df_export.assign(sample_weight=df_export['pseudo_confidence'] ** 2)

pseudo_labels = df_final['pseudo_label']
print("\nDataset final:")
print(f"  Total samples: {len(df_final)}")
print(f"  Hoax: {(pseudo_labels == 'hoax').sum()}")
print(f"  Valid: {(pseudo_labels == 'valid').sum()}")

# Weight statistics over the whole dataset
all_weights = df_final['sample_weight']
print("\nSample weight statistics:")
print(f"  Min weight: {all_weights.min():.4f}")
print(f"  Max weight: {all_weights.max():.4f}")
print(f"  Mean weight: {all_weights.mean():.4f}")

# Weight statistics per class
hoax_weights = df_final.loc[pseudo_labels == 'hoax', 'sample_weight']
valid_weights = df_final.loc[pseudo_labels == 'valid', 'sample_weight']
for class_title, class_weights in (("Hoax", hoax_weights), ("Valid", valid_weights)):
    print(f"\n{class_title} samples weight distribution:")
    print(f"  Mean: {class_weights.mean():.4f}")
    print(f"  Min: {class_weights.min():.4f}")
    print(f"  Max: {class_weights.max():.4f}")

print("\n" + "=" * 80)
print("DATASET SIAP UNTUK TRAINING")
print("=" * 80)
print("\nNote:")
for note_line in (
    "  - Low-confidence samples (0.50-0.60) punya weight rendah (0.25-0.36)",
    "  - High-confidence samples (0.98-0.99) punya weight tinggi (0.96-0.98)",
    "  - Model otomatis belajar lebih dari high-confidence samples",
    "  - Noise dari borderline cases diminimalkan",
):
    print(note_line)



PERSIAPAN DATA TRAINING (TANPA MANUAL REVIEW)

Dataset final:
  Total samples: 208
  Hoax: 52
  Valid: 156

Sample weight statistics:
  Min weight: 0.2527
  Max weight: 0.9899
  Mean weight: 0.8294

Hoax samples weight distribution:
  Mean: 0.4014
  Min: 0.2527
  Max: 0.8202

Valid samples weight distribution:
  Mean: 0.9721
  Min: 0.9630
  Max: 0.9899

DATASET SIAP UNTUK TRAINING

Note:
  - Low-confidence samples (0.50-0.60) punya weight rendah (0.25-0.36)
  - High-confidence samples (0.98-0.99) punya weight tinggi (0.96-0.98)
  - Model otomatis belajar lebih dari high-confidence samples
  - Noise dari borderline cases diminimalkan
