In [3]:
import pandas as pd
import re
import numpy as np

# ============== 1. DEFINISIKAN FILTER ==============

# Filter 1: Daftar Hitam Anti-Bocor
LEAKAGE_BLACKLIST = [
    r"hasil periksa fakta", r"periksa fakta", r"faktanya",
    r"penjelasan:", r"referensi:", r"klaim:", r"narasi:",
    r"anggota komisariat mafindo", r"relawan mafindo",
    r"universitas sumatera utara", r"uin raden mas said",
    r"universitas diponegoro", r"universitas indonesia",
    r"institut ilmu sosial", r"universitas muhammadiyah",
    r"universitas pendidikan indonesia",
    r"informasi tersebut salah", r"merupakan konten",
    r"tidak benar", "informasi palsu", r"fakta bit.ly",
    r"hasil penelusuran", r"cuitan akun", r"faktanya judul",
    r"faktanya, unhcr", r"klaim sri sultan", r"informasi menyesatkan",
    r"tidak ditemukan pemberitaan", r"video tersebut merupakan",
    r"selengkapnya di bagian penjelasan", r"kategori:", r"\[kategori\]",
    r"konten yang dimanipulasi", r"konten yang menyesatkan"
]

def is_text_clean(text: str) -> bool:
    """Mengembalikan True jika BERSIH, False jika BOCOR."""
    if not isinstance(text, str): return False
    text_lower = text.lower()
    if len(text.split()) < 3: return False
    for pattern in LEAKAGE_BLACKLIST:
        if re.search(pattern, text_lower):
            return False # Ditemukan kebocoran, TOLAK
    return True # Aman, tidak ada kebocoran

# Filter 2: Daftar Putih Politik (Ketat)
POLITICAL_KEYWORDS = [
    "prabowo", "subianto", "presiden", "menteri", "pemerintah", "politik", 
    "pemilu", "pilpres", "capres", "pilkada", "partai", "dpr", "mpr",
    "parlemen", "legislatif", "eksekutif", "yudikatif",
    "kebijakan", "program", "anggaran", "apbn", "pajak", "ekonomi",
    "investasi", "pembangunan", "infrastruktur", "proyek", "kementerian",
    "kebijakan publik", "regulasi", "undang-undang", "uu",
    "indonesia", "nasional", "jakarta", "ibu kota", "ikn", "nusantara",
    "papua", "aceh", "kalimantan", "perbatasan", "natuna",
    "jokowi", "joko widodo", "ganjar", "ganjar pranowo", "anies", 
    "anies baswedan", "megawati", "hasto", "sri mulyani",
    "gerindra", "pdip", "pdi perjuangan", "golkar", "demokrat",
    "pkb", "pan", "nasdem", "psi", "perindo", "ppp",
    "bansos", "bantuan sosial", "subsidi", "utang", "defisit",
    "rapat", "sidang", "pertemuan", "kunjungan", "pidato", "pernyataan",
    "ruu", "perampasan aset", "habib", "riz_ieq", "fpi", "baâ€™alawi"
]

political_regex = r'\b(' + '|'.join(re.escape(k) for k in POLITICAL_KEYWORDS) + r')\b'

def is_political(full_text: str) -> bool:
    """Mengembalikan True jika mengandung keyword politik."""
    if not isinstance(full_text, str): return False
    if re.search(political_regex, full_text.lower()):
        return True
    return False

# ============== 2. PROSES FILE YANG DIUPLOAD ==============

file_path = "../data/raw/turnbackhoax/metadata/tbh_complete_dataset1.csv"
output_path = "../data/raw/turnbackhoax/metadata/tbh_BERSIH_POLITIK_SAJA.csv"

print(f"Memuat file: {file_path}")
try:
    df = pd.read_csv(file_path)
    print(f"Total baris dimuat: {len(df)}")
    
    # 3. Terapkan Filter Anti-Bocor
    # Asumsi teks ada di kolom 'post_text' dari skrip Anda sebelumnya
    if 'post_text' in df.columns:
        df['is_clean'] = df['post_text'].apply(is_text_clean)
        df_clean = df[df['is_clean'] == True]
        print(f"Data setelah filter anti-bocor: {len(df_clean)} baris")
        print(f"   (Dibuang {len(df) - len(df_clean)} baris bocor)")
    else:
        print("ERROR: Kolom 'post_text' tidak ditemukan. Menggunakan 'text'.")
        df['is_clean'] = df['text'].apply(is_text_clean)
        df_clean = df[df['is_clean'] == True]
        print(f"Data after anti-leak filter: {len(df_clean)} rows")

    # 4. Terapkan Filter Politik
    if 'blog_title' in df.columns and 'post_text' in df.columns:
        df_clean['full_text'] = df_clean['blog_title'].astype(str) + " " + df_clean['post_text'].astype(str)
        df_clean['is_political'] = df_clean['full_text'].apply(is_political)
        df_final = df_clean[df_clean['is_political'] == True]
        print(f"Data setelah filter politik: {len(df_final)} baris")
        print(f"   (Dibuang {len(df_clean) - len(df_final)} baris non-politik)")
    else:
        print("ERROR: Kolom 'blog_title'/'post_text' tidak ditemukan. Menggunakan 'title'/'text' untuk filter politik.")
        df_clean['full_text'] = df_clean['title'].astype(str) + " " + df_clean['text'].astype(str)
        df_clean['is_political'] = df_clean['full_text'].apply(is_political)
        df_final = df_clean[df_clean['is_political'] == True]
        print(f"Data after political filter: {len(df_final)} rows")

    # 5. Buang duplikat teks terakhir
    total_before_dedup = len(df_final)
    df_final = df_final.drop_duplicates(subset=['post_text' if 'post_text' in df_final.columns else 'text'], keep='first')
    print(f"Data setelah filter duplikat: {len(df_final)} baris")
    print(f"   (Dibuang {total_before_dedup - len(df_final)} duplikat)")

    # 6. Simpan file bersih
    # Hapus kolom helper sebelum menyimpan
    df_final = df_final.drop(columns=['is_clean', 'full_text', 'is_political'], errors='ignore')
    
    df_final.to_csv(output_path, index=False)
    
    print("\n" + "="*50)
    print("ðŸŽ‰ SUKSES! DATA BERSIH TELAH DISIMPAN.")
    print(f"File bersih Anda adalah: {output_path}")
    print(f"Total data bersih: {len(df_final)}")
    print("Gunakan file ini untuk digabungkan dengan data 'news' Anda.")
    print("="*50)

except FileNotFoundError:
    print(f"ERROR: File {file_path} tidak ditemukan.")
except Exception as e:
    print(f"Terjadi error: {e}")
    print("\n--- Info Data ---")
    df_check = pd.read_csv(file_path)
    print(df_check.info())
    print(df_check.head())

Memuat file: ../data/raw/turnbackhoax/metadata/tbh_complete_dataset1.csv
Total baris dimuat: 1973
ERROR: Kolom 'post_text' tidak ditemukan. Menggunakan 'text'.
Data after anti-leak filter: 1956 rows
ERROR: Kolom 'blog_title'/'post_text' tidak ditemukan. Menggunakan 'title'/'text' untuk filter politik.
Data after political filter: 1186 rows
Data setelah filter duplikat: 1179 baris
   (Dibuang 7 duplikat)

ðŸŽ‰ SUKSES! DATA BERSIH TELAH DISIMPAN.
File bersih Anda adalah: ../data/raw/turnbackhoax/metadata/tbh_BERSIH_POLITIK_SAJA.csv
Total data bersih: 1179
Gunakan file ini untuk digabungkan dengan data 'news' Anda.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['full_text'] = df_clean['title'].astype(str) + " " + df_clean['text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['is_political'] = df_clean['full_text'].apply(is_political)
