In [None]:

# CELL 1: IMPORT LIBRARY (BAWAAN COLAB)
# Tidak perlu pip install library aneh-aneh
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud


In [None]:

# CELL 2: LOAD DATA & PREPROCESSING
# Remove any `df` left over from an earlier run of this cell
# (presumably so that a failed load cannot be mistaken for a fresh one).
try:
    del df
except NameError:
    pass

# === CONFIG URL ===
# Location of the dataset. A local-path alternative is kept for reference:
# source_path = "data/dataset_pidato_3k.csv"
source_path = "https://github.com/rhnrafif/datamining_1/blob/main/data/dataset_pidato_3k.csv"

def get_raw_url(github_url):
    """Convert a GitHub "blob" page URL into its raw-content URL.

    Args:
        github_url: A URL or local path as a string.

    Returns:
        The input with the first ``github.com`` replaced by
        ``raw.githubusercontent.com`` and the first ``/blob/`` segment
        removed, when both markers are present. Any other input is returned
        unchanged.
    """
    if 'github.com' in github_url and '/blob/' in github_url:
        # Limit each replacement to the first match: the host and the "blob"
        # marker come first in the URL, and later occurrences belong to the
        # file path itself (e.g. a directory named "blob") and must survive.
        return (
            github_url
            .replace('github.com', 'raw.githubusercontent.com', 1)
            .replace('/blob/', '/', 1)
        )
    return github_url

try:
    print(f"🔍 Memeriksa sumber data: {source_path}")

    # Load the data: remote sources go through get_raw_url, local paths are
    # read directly.
    if source_path.startswith('http'):
        df = pd.read_csv(get_raw_url(source_path))
    else:
        if not os.path.exists(source_path):
            # The configured path does not exist: fall back to the first file
            # with the same base name found anywhere below the working directory.
            for root, dirs, files in os.walk("."):
                if os.path.basename(source_path) in files:
                    source_path = os.path.join(root, os.path.basename(source_path))
                    break
        df = pd.read_csv(source_path)

    # Auto-detect the text column by name; if none of the known names is
    # present, the first column of the frame is used.
    col_text = next((c for c in df.columns if c.lower() in ['text', 'tweet', 'content', 'review']), df.columns[0])
    print(f"✅ Data Loaded. Kolom Teks: '{col_text}'")

    # Small hand-written stop-word list, so the notebook runs without
    # downloading any NLTK resources.
    stop_words = {'dan', 'di', 'ke', 'dari', 'yang', 'pada', 'untuk', 'adalah', 'sebagai',
                  'yg', 'gak', 'ga', 'kalo', 'kl', 'bgt', 'dr', 'dlm', 'tdk', 'jd', 'jgn',
                  'sdh', 'aja', 'n', 't', 'ny', 'sy', 'aku', 'saya', 'kamu', 'dia', 'ini', 'itu'}

    def clean_text(text):
        """Normalise one document into a space-separated token string.

        Lower-cases the input, deletes every character that is neither a word
        character nor whitespace, deletes digit runs, then drops stop words
        and tokens of two characters or fewer.

        Missing values (None / NaN / NA) yield an empty string; without this
        guard ``str(nan)`` would inject the literal token "nan" into the
        vocabulary.
        """
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        return " ".join([w for w in text.split() if w not in stop_words and len(w) > 2])

    df['clean_text'] = df[col_text].apply(clean_text)
    print("✅ Preprocessing Selesai.")

except Exception as e:
    print(f"❌ Error Load Data: {e}")
    # Re-raise so the notebook stops here instead of letting later cells run
    # against a missing or half-prepared `df`.
    raise


In [None]:

# CELL 3: TRAINING MODEL (NMF - scikit-learn)
try:
    print("⏳ Sedang melatih model NMF")

    # 1. Turn the cleaned text into a TF-IDF matrix.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
    tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])

    # 2. Fit NMF to factor the TF-IDF matrix into NUM_TOPICS components.
    # NOTE(review): l1_ratio only mixes L1/L2 regularisation when the alpha
    # terms are non-zero; with scikit-learn's default alpha no regularisation
    # is applied, so l1_ratio appears to be inert here — confirm intent.
    NUM_TOPICS = 3
    nmf_model = NMF(n_components=NUM_TOPICS, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf)

    print("✅ Training Selesai!")

    # Collect the highest-weighted words of every topic.
    feature_names = tfidf_vectorizer.get_feature_names_out()
    TOP_N_WORDS = 20  # words kept per topic for the visualisation cell

    topic_data = {}
    for topic_idx, topic in enumerate(nmf_model.components_):
        # argsort is ascending, so reverse it and keep the first TOP_N_WORDS.
        top_indices = topic.argsort()[::-1][:TOP_N_WORDS]
        top_words = [feature_names[i] for i in top_indices]
        top_weights = [topic[i] for i in top_indices]
        # Insertion order is descending weight; the plotting cell relies on it.
        topic_data[topic_idx] = dict(zip(top_words, top_weights))
        print(f"Topik {topic_idx}: {', '.join(top_words[:5])}...")

except Exception as e:
    print(f"❌ Error Training: {e}")
    # Re-raise so later cells cannot silently reuse a stale model or
    # topic_data left in the kernel by an earlier successful run.
    raise


In [None]:

# CELL 4: VISUALISATION (Word Cloud & Bar Chart)
# Bar colours, indexed by topic number and reused cyclically.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

for t in range(NUM_TOPICS):
    # Skip topic indices for which no word weights were collected.
    if t not in topic_data:
        continue

    words_dict = topic_data[t]
    fig, (ax_cloud, ax_bar) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: word cloud sized by the topic's word weights.
    wordcloud = WordCloud(background_color='white', width=800, height=600).generate_from_frequencies(words_dict)
    ax_cloud.imshow(wordcloud, interpolation="bilinear")
    ax_cloud.axis("off")
    ax_cloud.set_title(f'Word Cloud - Topik {t}', fontsize=16)

    # Right panel: horizontal bars for the first 10 entries of the dict.
    top_items = list(words_dict.items())[:10]
    bar_labels = [word for word, _ in top_items]
    bar_values = [weight for _, weight in top_items]
    ax_bar.barh(bar_labels, bar_values, color=colors[t % len(colors)])
    # Flip the axis so the first entry is drawn at the top.
    ax_bar.invert_yaxis()
    ax_bar.set_title(f'Kata Kunci Dominan - Topik {t}', fontsize=16)

    fig.tight_layout()
    plt.show()


In [None]:

# CELL 5: TOPIC PREDICTION PER DOCUMENT
# Project every document onto the fitted topics; one row per document and
# one column per topic.
topic_values = nmf_model.transform(tfidf)

# The dominant topic is the column with the largest weight in each row.
# NOTE(review): argmax returns 0 when every weight in a row is equal, so a
# document with all-zero weights (e.g. an empty clean_text) would be counted
# under topic 0 — confirm whether such rows should be excluded.
df['Dominant_Topic'] = topic_values.argmax(axis=1)

print("\n=== CONTOH HASIL PREDIKSI ===")
display(df[[col_text, 'Dominant_Topic']].head())

# Distribution of documents across topics.
plt.figure(figsize=(8, 4))
sns.countplot(x='Dominant_Topic', data=df)
plt.title('Jumlah Dokumen per Topik')
plt.show()