In [None]:

# CELL 1: INSTALL & IMPORT (UPDATE)
# Install library
!pip install -q bertopic pandas requests matplotlib plotly nltk

from bertopic import BERTopic
import pandas as pd
import os
import requests
import re
import nltk

# --- NLTK RESOURCES ---
# Download every NLTK resource the notebook requires: the 'punkt' and
# 'punkt_tab' tokenizer data (the latter was the one missing earlier) and
# the 'stopwords' corpus.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

print("‚úÖ Setup & Download NLTK Selesai!")

# CELL 2: LOAD, CHUNK, CLEAN & TRAIN
import csv
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize # Import di atas biar rapi

# Drop a model left over from a previous run, if any, to free up RAM
if 'topic_model' in locals():
    del topic_model

# === DATA SOURCE CONFIGURATION ===
# GitHub "blob" page URL of the speech dataset (CSV)
source_path = (
    "https://github.com/rhnrafif/datamining_1/blob/main/data/dataset_pidato_UN.csv"
)

# === 1. STOPWORD DEFINITIONS ===
# Base list: the Indonesian stopwords shipped with NLTK
indo_stopwords = stopwords.words('indonesian')

# Extra stopwords specific to this corpus of speeches
custom_stopwords = [
    # common function words
    'yang', 'di', 'dan', 'ini', 'itu', 'dari', 'ke', 'pada', 'untuk', 'adalah',
    'sebagai', 'dengan', 'dalam', 'juga', 'karena', 'bahwa', 'tersebut', 'oleh',
    'atau', 'sudah',
    # pronouns
    'saya', 'kita', 'kami', 'mereka', 'anda', 'dia',
    # forms of address and thanks
    'bapak', 'ibu', 'saudara', 'hadirin', 'sekalian', 'terima', 'kasih',
    # opening / closing greetings
    'assalamualaikum', 'waalaikumsalam', 'warahmatullahi', 'wabarakatuh',
    'salam', 'hormat', 'selamat', 'pagi', 'siang', 'sore', 'malam',
    # honorifics and generic nouns
    'yang', 'mulia', 'para', 'presiden', 'indonesia', 'bangsa', 'negara',
]

# Merge both lists; building a set collapses duplicates (e.g. the repeated 'yang')
final_stopwords = list({*indo_stopwords, *custom_stopwords})

def get_raw_url(github_url):
    """Convert a GitHub "blob" page URL into its raw-content URL.

    Args:
        github_url: URL string. It is rewritten only when it contains both
            'github.com' and '/blob/'.

    Returns:
        The URL with the first 'github.com' replaced by
        'raw.githubusercontent.com' and the first '/blob/' segment removed;
        any other URL is returned unchanged.
    """
    if 'github.com' in github_url and '/blob/' in github_url:
        # Limit each replacement to the first occurrence: a later '/blob/'
        # (or 'github.com') is part of the file path and must be kept,
        # e.g. .../blob/main/blob/file.csv -> .../main/blob/file.csv
        return (
            github_url
            .replace('github.com', 'raw.githubusercontent.com', 1)
            .replace('/blob/', '/', 1)
        )
    return github_url

# --- CHUNKER FUNCTION ---
def split_long_text(text):
    """Break one speech into chunks longer than 30 characters.

    The input is converted to ``str``, and tripled or doubled double-quote
    characters are collapsed to a single one. The text is then split on
    newlines (paragraphs); when it contains no newline, it is split into
    sentences with NLTK's ``sent_tokenize`` instead.

    Args:
        text: The speech to split; any object, converted with ``str()``.

    Returns:
        A list of whitespace-stripped chunks, keeping only those whose
        length exceeds 30 characters.
    """
    normalized = str(text).replace('"""', '"').replace('""', '"')

    # 1. Paragraph split: one piece per line break
    pieces = normalized.split('\n')

    # 2. Fallback when there is no line break: sentence split, which relies
    #    on the NLTK tokenizer data ('punkt_tab') downloaded earlier
    if len(pieces) < 2:
        pieces = sent_tokenize(normalized)

    stripped_pieces = (piece.strip() for piece in pieces)
    return [piece for piece in stripped_pieces if len(piece) > 30]

# Main pipeline: load the speeches, split them into paragraphs, train BERTopic
# and print a summary. Any failure is reported as a single error line.
try:
    print(f"üîç Memeriksa sumber data: {source_path}")

    # --- LOAD DATA ---
    # The separator is a string that is not expected to occur in the data, so
    # each record is presumably read whole into the single 'text' column
    # (a multi-character sep needs the python engine).
    load_params = {'sep': 'üíæ', 'engine': 'python', 'header': None, 'names': ['text']}

    if source_path.startswith('http'):
        # Remote source: translate a GitHub page URL into its raw-file URL
        df_raw = pd.read_csv(get_raw_url(source_path), **load_params)
    else:
        try:
            df_raw = pd.read_csv(source_path, **load_params)
        except Exception:
            # Best-effort fallback for local files: default separator,
            # skipping malformed lines. `except Exception` (not a bare
            # `except`) so KeyboardInterrupt / SystemExit still propagate.
            df_raw = pd.read_csv(source_path, on_bad_lines='skip', engine='python', names=['text'])

    col_text = df_raw.columns[0]

    # --- CHUNKING ---
    print("‚è≥ Sedang memecah pidato panjang...")
    all_paragraphs = []
    for full_speech in df_raw[col_text]:
        paragraphs = split_long_text(full_speech)
        all_paragraphs.extend(paragraphs)

    df_clean = pd.DataFrame(all_paragraphs, columns=['text'])

    # Fail early with a clear message instead of handing BERTopic an empty
    # corpus; the error is reported by the handler below.
    if not all_paragraphs:
        raise ValueError("Tidak ada paragraf yang bisa dianalisis dari sumber data.")

    print(f"‚úÖ Siap Analisis: {len(df_clean)} Paragraf.")

    # --- TRAIN BERTOPIC ---
    # Topic keywords are extracted with the combined stopword list removed
    vectorizer_model = CountVectorizer(stop_words=final_stopwords)

    print("\n‚è≥ Sedang melatih BERTopic...")

    docs = df_clean['text'].tolist()

    topic_model = BERTopic(
        language="multilingual",
        vectorizer_model=vectorizer_model,
        verbose=True
    )

    topics, probs = topic_model.fit_transform(docs)
    print("‚úÖ Training Selesai!")

    # --- RESULTS ---
    print("\n=== INFO TOPIK ===")
    display(topic_model.get_topic_info().head(10))

    print("\n=== KATA KUNCI UTAMA (Topik 0) ===")
    print(topic_model.get_topic(0))

except Exception as e:
    print(f"‚ùå Error: {e}")

# Visualize only when a `topic_model` object exists in this scope
if 'topic_model' in locals():
    print("üìä Visualisasi Data...")
    try:
        # Keyword bar chart for the top 8 topics; shown before the next
        # figure is built, so it still appears if that one fails
        fig1 = topic_model.visualize_barchart(top_n_topics=8)
        fig1.show()

        # Intertopic distance map
        fig2 = topic_model.visualize_topics()
        fig2.show()
    except Exception as viz_error:
        print(f"Gagal visualisasi: {viz_error}")