In [None]:

# 1. INSTALASI & IMPORT
print("Menginstall library NLP & Plotting...")
!pip install -q Sastrawi pandas scikit-learn matplotlib seaborn nltk

import pandas as pd
import numpy as np
import os
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


In [None]:

# 2. DOWNLOAD DATA (AUTO RAW URL)
# Link asli dari kamu (masih format blob/html)
original_url = "https://github.com/rhnrafif/datamining_1/blob/main/data/dataset_pidato_UN.csv"

# Ubah otomatis ke format RAW agar bisa didownload mesin
raw_url = original_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

print(f"Mendownload pidato dari: {raw_url}")
if not os.path.exists('data'):
    os.makedirs('data')

# Download pakai wget biar stabil
!wget -q -O data/pidato_prabowo.csv {raw_url}


In [None]:

# 3. LOAD & CHUNK (SPLIT THE SPEECH)
print("Memproses & Memecah Pidato...")

# Read the file as plain text (not as CSV columns) because it holds one long speech
try:
    with open('data/pidato_prabowo.csv', 'r', encoding='utf-8') as f:
        full_text = f.read()
except UnicodeDecodeError:
    # Fall back to latin-1 only when the bytes are not valid UTF-8.
    # Other failures (e.g. a missing file) are no longer swallowed and surface as-is.
    with open('data/pidato_prabowo.csv', 'r', encoding='latin-1') as f:
        full_text = f.read()

# PARAGRAPH SPLITTER
def split_paragraphs(text, min_length=30):
    """Split raw speech text into a list of meaningful paragraphs.

    Args:
        text: Full file content as a single string; paragraphs are separated
            by newline characters.
        min_length: A line is kept only when its stripped length is strictly
            greater than this value. The default (30) is the threshold that
            was previously hard-coded.

    Returns:
        A list of stripped, non-trivial lines in their original order.
    """
    # Remove CSV artifacts (runs of tripled/doubled double quotes)
    text = text.replace('"""', '').replace('""', '')

    # Split on newlines, trim each line, and drop short ones (titles, opening
    # greetings) that would only add noise to the clustering.
    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if len(line) > min_length]

# Break the speech into paragraphs and store them one per row
paragraphs = split_paragraphs(full_text)
df = pd.DataFrame(data=paragraphs, columns=['text_original'])

# Report how many chunks were produced and preview the first three rows
print(f"Pidato berhasil dipecah menjadi: {df.shape[0]} Paragraf/Bagian.")
print("Contoh data awal:")
print(df.iloc[:3])


In [None]:

# 4. PREPROCESSING (SASTRAWI)
# The leading emoji was stored as mojibake (UTF-8 bytes decoded with the wrong codec); restored here.
print("\n🧹 Membersihkan teks (Stopwords & Stemming)...")

# Build the Sastrawi stopword remover and stemmer once; clean_text reuses them for every paragraph
factory_stop = StopWordRemoverFactory()
stopword = factory_stop.create_stop_word_remover()
factory_stem = StemmerFactory()
stemmer = factory_stem.create_stemmer()

# Extra speech-specific stopwords (so they do not show up in the clusters)
custom_ignore = ['yang', 'mulia', 'para', 'hadirin', 'sekalian', 'tuan', 'pbb', 'presiden', 'di', 'dan', 'ke', 'dari']

def clean_text(text):
    """Normalize one paragraph before TF-IDF vectorization.

    Steps: lowercase, drop digits, replace punctuation with spaces, remove the
    built-in stopwords via the module-level `stopword` remover, remove the
    words listed in the module-level `custom_ignore`, then stem with the
    module-level `stemmer`.

    Args:
        text: Any value; it is converted with str() first.

    Returns:
        The cleaned, stemmed text as a string.
    """
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)  # Drop digits

    # Replace punctuation with spaces instead of deleting it, so hyphenated or
    # punctuation-joined words ("negara-negara", "adil,makmur") are not fused
    # into a single bogus token.
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # Remove the built-in stopwords
    text = stopword.remove(text)

    # Remove custom stopwords token by token. Matching whole tokens also catches
    # words at the very start/end of the text and consecutive repeats, which a
    # " word " substring replacement misses.
    ignored = set(custom_ignore)
    text = ' '.join(token for token in text.split() if token not in ignored)

    # Stemming (this step takes a while)
    return stemmer.stem(text)

# Run the cleaning pipeline over every paragraph
df['text_clean'] = df['text_original'].map(clean_text)


In [None]:

# 5. TF-IDF & K-MEANS
print("Menghitung K-Means...")

# Turn the cleaned text into a numeric TF-IDF matrix (one row per paragraph)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])

# SPLIT INTO 3 MAIN TOPICS (e.g. Opening/Diplomacy, History/Colonialism, Future Challenges)
JUMLAH_CLUSTER = 3

# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it implicit gives different clusterings (and a FutureWarning on
# some releases) depending on the installed version.
kmeans = KMeans(n_clusters=JUMLAH_CLUSTER, n_init=10, random_state=42)
kmeans.fit(X)
df['cluster'] = kmeans.labels_


In [None]:

# 6. VISUALIZE THE RESULT (SCATTER PLOT)
print("\n" + "=" * 40, "PETA TOPIK PIDATO", "=" * 40, sep="\n")

# Project the TF-IDF matrix onto two principal components to get X/Y coordinates
pca = PCA(n_components=2)
coords = pca.fit_transform(X.toarray())

# Draw on an explicit Axes object; one marker colour and shape per cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=coords[:, 0],
    y=coords[:, 1],
    hue=df['cluster'],
    style=df['cluster'],
    palette='viridis',
    s=100,
    ax=ax,
)

ax.set_title(f'Sebaran Topik Pidato Presiden (K={JUMLAH_CLUSTER})')
ax.set_xlabel('Dimensi Topik 1')
ax.set_ylabel('Dimensi Topik 2')
ax.legend(title='Kelompok Topik')
ax.grid(True, alpha=0.3)
plt.show()


In [None]:

# 7. INTERPRETATION (WHAT IS IN EACH CLUSTER?)
# Section banner: blank line, rule, title, rule
print("\n" + "=" * 40, "BEDAH ISI PIDATO PER KELOMPOK", "=" * 40, sep="\n")

# Find the dominant keywords for each cluster
def get_top_keywords(data, clusters, labels, n_terms=5):
    """Print the dominant TF-IDF terms and two sample paragraphs per cluster.

    Reads the module-level `vectorizer` for the term names and the module-level
    `df` (columns 'cluster' and 'text_original') for the sample paragraphs.

    Args:
        data: Sparse TF-IDF matrix with one row per paragraph.
        clusters: Cluster id of each row of `data`, in row order.
        labels: Unused; kept so existing calls keep working.
        n_terms: Number of top terms to print for each cluster.

    Returns:
        None. The report is written to stdout.
    """
    # toarray() gives a plain ndarray (todense() returns np.matrix). Grouping by
    # a bare array pairs rows by position, independent of any pandas index.
    cluster_means = pd.DataFrame(data.toarray()).groupby(np.asarray(clusters)).mean()
    terms = vectorizer.get_feature_names_out()
    for i, r in cluster_means.iterrows():
        # Sort term positions by descending mean weight so the most dominant
        # keyword comes first; slicing after the reversal also makes
        # n_terms=0 yield no terms instead of all of them.
        top_positions = np.argsort(r.to_numpy())[::-1][:n_terms]
        keywords = ', '.join(terms[t] for t in top_positions)
        print(f"\n📂 KELOMPOK {i}:")
        print(f"   (Kata Kunci Dominan: {keywords})")
        print("-" * 20)
        # Show sample original paragraphs, truncated to 100 characters
        contoh = df.loc[df['cluster'] == i, 'text_original'].head(2).tolist()
        for c in contoh:
            print(f"   🗣️ \"{c[:100]}...\"")

get_top_keywords(X, df['cluster'], kmeans.labels_)