In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

!pip install pandas gensim nltk pyLDAvis wordcloud plotly matplotlib requests


import pandas as pd
import numpy as np
import re
import os
import requests
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
import gensim.corpora as corpora
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pyLDAvis
import pyLDAvis.gensim_models

# Download the NLTK resources used for stopword removal and tokenization.
# nltk.download() reports failure through its return value instead of raising,
# so collect the failures and surface them rather than always claiming success.
failed_resources = [
    resource
    for resource in ('stopwords', 'punkt', 'punkt_tab')
    if not nltk.download(resource)
]

if failed_resources:
    print(f"⚠️ Gagal mengunduh resource NLTK: {', '.join(failed_resources)}")
else:
    print("✅ Library siap digunakan.")
# CELL 2: LOAD DATA & PREPROCESSING
# Discard a DataFrame left over from a previous run of this cell.
if 'df' in locals():
    del df

# === DATA SOURCE CONFIGURATION ===
# Replace with a local path OR a GitHub Raw URL.
# Layout of the example data: text,label (CSV)
# source_path = "data/dataset_pidato_3k.csv"
source_path = (
    "https://github.com/rhnrafif/datamining_1/blob/main/data/dataset_pidato_3k.csv"
)

def get_raw_url(github_url):
    """Convert a GitHub "blob" page URL into its raw-content URL.

    Args:
        github_url: URL or path of the data source.

    Returns:
        The ``raw.githubusercontent.com`` equivalent when ``github_url``
        contains both ``github.com`` and ``/blob/``; otherwise the input
        unchanged.
    """
    if 'github.com' in github_url and '/blob/' in github_url:
        # Rewrite only the FIRST occurrence of each marker. A repository named
        # like "user.github.com" or a directory called "blob" further down the
        # path must survive untouched, otherwise the resulting URL is wrong.
        raw_url = github_url.replace('github.com', 'raw.githubusercontent.com', 1)
        return raw_url.replace('/blob/', '/', 1)
    return github_url

# --- STOPWORDS SETUP ---
# Extra tokens to drop on top of NLTK's Indonesian stopword list.
custom_slang = {
    'yg', 'gak', 'ga', 'kalo', 'kl', 'bgt', 'dr', 'dlm', 'tdk', 'jd', 'jgn',
    'sdh', 'aja', 'n', 't', 'ny', 'sy', 'aku', 'saya', 'kamu', 'dia', 'ini',
    'itu', 'dan', 'di', 'ke', 'dari', 'yang', 'pada', 'untuk', 'adalah',
    'sebagai',
}
stop_words = set(stopwords.words('indonesian')) | custom_slang

def clean_text(text):
    """Turn one raw document into a list of filtered tokens.

    The value is lower-cased, stripped of punctuation and digits, tokenized
    with NLTK's ``word_tokenize`` and filtered against the module-level
    ``stop_words`` set. Tokens of two characters or fewer are dropped.

    Args:
        text: Raw cell value. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as an empty document.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # A missing cell must not be stringified: str(nan) == "nan" would pass
    # every filter below and leak a bogus "nan" token into the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)      # remove digits
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and len(word) > 2]

try:
    print(f"🔍 Memeriksa sumber data: {source_path}")

    # 1. LOAD DATA
    if source_path.startswith('http'):
        print("🌍 URL Terdeteksi. Download data...")
        raw_url = get_raw_url(source_path)
        df = pd.read_csv(raw_url)
    else:
        print("📂 File Lokal Terdeteksi...")
        if not os.path.exists(source_path):
            # The configured path does not exist: walk the working directory
            # and fall back to the first file with the same base name.
            target_name = os.path.basename(source_path)
            for root, _dirs, files in os.walk("."):
                if target_name in files:
                    source_path = os.path.join(root, target_name)
                    print(f"   ⚠️ Path dikoreksi ke: {source_path}")
                    break
        df = pd.read_csv(source_path)

    # A header-only file would otherwise fail much later with an opaque
    # IndexError when the first preprocessed row is printed.
    if df.empty:
        raise ValueError("Dataset kosong: tidak ada baris untuk diproses.")

    # Auto-detect the text column by name; fall back to the first column.
    col_text = next((c for c in df.columns if c.lower() in ['text', 'tweet', 'content', 'review']), df.columns[0])
    print(f"✅ Data Loaded ({len(df)} baris). Kolom Teks: '{col_text}'")

    # 2. PREPROCESSING & TOKENIZATION
    print("⏳ Membersihkan teks & Tokenisasi...")
    data_words = df[col_text].apply(clean_text).tolist()

    # 3. BUILD BIGRAMS
    # Merges e.g. "rumah" + "sakit" into "rumah_sakit" when the two tokens
    # co-occur often enough.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    data_ready = [bigram_mod[doc] for doc in data_words]

    print(f"✅ Preprocessing Selesai. Contoh data baris pertama:\n{data_ready[0]}")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so execution stops here. Swallowing the error would leave
    # `df` / `data_ready` undefined and make later cells fail with an
    # unrelated NameError.
    raise
# CELL 3: TRAINING LDA MODEL
try:
    # 1. Create Dictionary & Corpus
    id2word = corpora.Dictionary(data_ready)

    # Drop tokens that are too rare (present in fewer than 2 documents) or
    # too common (present in more than 90% of the documents).
    id2word.filter_extremes(no_below=2, no_above=0.9)

    corpus = [id2word.doc2bow(text) for text in data_ready]

    # 2. Train LDA Model
    # Adjust the number of topics as needed (e.g. 3, 5 or 10).
    NUM_TOPICS = 3

    print(f"⏳ Sedang melatih LDA dengan {NUM_TOPICS} topik...")
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=NUM_TOPICS,
                                           random_state=100,  # fixed seed
                                           chunksize=100,
                                           passes=10,
                                           per_word_topics=True)

    print("✅ Training Selesai!")

    # 3. Show the keywords of every topic
    print("\n=== KATA KUNCI PER TOPIK ===")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topik {idx}: {topic}")

except Exception as e:
    print(f"Error Training: {e}")
    # Re-raise so execution stops here. Swallowing the error would leave
    # `lda_model` / `corpus` undefined and make the visualization cells fail
    # with an unrelated NameError.
    raise
# CELL 4: INTERACTIVE VISUALIZATION (pyLDAvis)
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()

# Build the visualization data from the trained model, corpus and dictionary.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

print("Visualisasi Interaktif (Geser mouse ke bubble untuk melihat detail topik):")
vis
# CELL 5: WORD CLOUD & BAR CHART PER TOPIC (FIXED)
import matplotlib.pyplot as plt

# Take the colour list of the 'tab10' palette directly.
cols = plt.get_cmap('tab10').colors

# One figure per topic: word cloud on the left, bar chart on the right.
for t in range(lda_model.num_topics):
    fig, (ax_cloud, ax_bar) = plt.subplots(1, 2, figsize=(16, 6))

    # Top 20 words of this topic as a {word: weight} mapping.
    # lda_model.show_topic format: [('kata1', 0.1), ('kata2', 0.05)]
    topic_words = dict(lda_model.show_topic(t, 20))

    # --- 1. WORD CLOUD (LEFT) ---
    wordcloud = WordCloud(background_color='white', width=800, height=600, max_words=50)
    wordcloud.generate_from_frequencies(topic_words)
    ax_cloud.imshow(wordcloud, interpolation="bilinear")
    ax_cloud.axis("off")
    ax_cloud.set_title(f'Word Cloud - Topik {t}', fontsize=16)

    # --- 2. BAR CHART (RIGHT) ---
    df_chart = pd.DataFrame(list(topic_words.items()), columns=['Word', 'Weight'])

    # Cycle through the tab10 colours, one per topic.
    color_idx = t % len(cols)
    ax_bar.barh(df_chart['Word'], df_chart['Weight'], color=cols[color_idx])

    ax_bar.invert_yaxis()  # put the first (heaviest) word at the top
    ax_bar.set_title(f'Kata Kunci Dominan - Topik {t}', fontsize=16)

    fig.tight_layout()
    plt.show()
# CELL 6: DOMINANT TOPIC & RELATION WITH LABEL
# 1. Find the dominant topic of every document
def format_topics_sentences(ldamodel, corpus, texts):
    """Return the dominant topic and its weight for every document.

    Args:
        ldamodel: Trained topic model; ``ldamodel[corpus]`` must yield one
            result per document. A result may be a list of
            ``(topic_id, probability)`` pairs, or a tuple whose first element
            is that list (the shape indexed by the original code as ``row[0]``).
        corpus: Bag-of-words corpus passed to ``ldamodel``.
        texts: Unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame: One row per document, in corpus order, with columns
        ``Dominant_Topic`` (topic id) and ``Perc_Contribution`` (probability
        rounded to 4 decimals). A document for which the model reports no
        topic gets a NaN row so the frame stays aligned with the corpus.
    """
    records = []
    for doc_result in ldamodel[corpus]:
        # Accept both result shapes described above.
        topic_dist = doc_result[0] if isinstance(doc_result, tuple) else doc_result

        if topic_dist:
            # max() returns the first maximal pair, matching the tie-breaking
            # of the previous stable descending sort.
            topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
            records.append((int(topic_num), round(float(prop_topic), 4)))
        else:
            records.append((float('nan'), float('nan')))

    # Build the frame once; concatenating inside the loop is quadratic, and
    # assigning column names afterwards fails on an empty corpus.
    return pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution'])

df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, data_ready)

# Place the topic columns side by side with the original DataFrame.
df_dominant = pd.concat([df_topic_sents_keywords, df], axis=1)

print("\n=== CONTOH DATA DENGAN PREDIKSI TOPIK ===")
preview_columns = ['Dominant_Topic', 'Perc_Contribution', col_text]
display(df_dominant[preview_columns].head())

# 2. Visualize topic vs. manual label (only when a label column exists)
label_candidates = ['label', 'kategori', 'category']
col_label = next((c for c in df.columns if c.lower() in label_candidates), None)

if not col_label:
    print("Kolom label tidak ditemukan untuk perbandingan.")
else:
    print(f"\n=== HUBUNGAN TOPIK AI vs LABEL MANUAL '{col_label}' ===")

    # Count documents per (manual label, dominant topic) pair.
    crosstab = pd.crosstab(df_dominant[col_label], df_dominant['Dominant_Topic'])

    # Draw the counts as an annotated heatmap.
    import seaborn as sns
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(crosstab, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
    ax.set_title("Heatmap: Label Asli vs Topik LDA")
    ax.set_ylabel("Label Manual")
    ax.set_xlabel("Topik Hasil LDA")
    plt.show()

    print("Insight: Warna gelap menunjukkan konsentrasi dokumen. Jika label manual 'Ekonomi' banyak masuk ke Topik 0, berarti Topik 0 membahas Ekonomi.")