In [None]:

# 1. INSTALASI & IMPORT
print("Menginstall library WordCloud & lainnya...")
!pip install -q Sastrawi pandas scikit-learn matplotlib seaborn nltk wordcloud requests

import pandas as pd
import numpy as np
import os
import re
import string
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from wordcloud import WordCloud


In [None]:

# 2. DOWNLOAD DATA (pure Python, no shell tools)
# GitHub page link of the dataset (UN speech); it is rewritten below into the
# raw-content URL so the response body is the CSV itself, not an HTML page.
original_url = "https://github.com/rhnrafif/datamining_1/blob/main/data/dataset_pidato_UN.csv"
raw_url = original_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

print("Mendownload pidato...")
# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs('data', exist_ok=True)

try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(raw_url, timeout=30)
    # Without this, a 404/500 error page would be saved to disk and reported
    # as a successful download.
    response.raise_for_status()
    with open('data/pidato_prabowo.csv', 'wb') as f:
        f.write(response.content)

    # Read the saved file back as text; later cells consume `full_text`.
    with open('data/pidato_prabowo.csv', 'r', encoding='utf-8') as f:
        full_text = f.read()
    print("Download Berhasil!")
except Exception as e:
    # Best effort: report the problem and leave `full_text` empty so the
    # name is still defined for the following cells.
    print(f"❌ Gagal download (Coba cek koneksi): {e}")
    full_text = ""


In [None]:

# 3. CHUNKING & PREPROCESSING
print("Memproses teks...")

def split_paragraphs(text):
    """Split raw text into a list of paragraph strings, one per input line.

    Runs of three and then two double-quote characters are deleted first.
    The text is then split on newlines, each line is stripped of surrounding
    whitespace, and only lines longer than 30 characters are kept.

    Args:
        text: The full raw text as a single string.

    Returns:
        A list of the stripped lines that passed the length filter, in
        their original order.
    """
    unquoted = text.replace('"""', '').replace('""', '')
    stripped_lines = (line.strip() for line in unquoted.split('\n'))
    # Keep only lines with more than 30 characters.
    return [line for line in stripped_lines if len(line) > 30]

# One row per paragraph; the untouched paragraph text lives in 'text_original'.
df = pd.DataFrame(split_paragraphs(full_text), columns=['text_original'])

# Sastrawi helpers: a stemmer and a stopword remover, each built from its factory.
factory_stem = StemmerFactory()
stemmer = factory_stem.create_stemmer()
factory_stop = StopWordRemoverFactory()
stopword = factory_stop.create_stop_word_remover()

# Additional words to ignore so the word clouds stay clean.
custom_ignore = [
    'yang', 'mulia', 'para', 'hadirin', 'sekalian', 'tuan',
    'pbb', 'presiden', 'di', 'dan', 'ke', 'dari',
    'itu', 'ini', 'adalah', 'kami', 'kita', 'saya',
]

def clean_text(text):
    """Normalize one paragraph before vectorization.

    Steps, in order: convert to a lowercase string, delete digit runs,
    delete ASCII punctuation, run the module-level ``stopword`` remover,
    drop every token listed in ``custom_ignore``, then run the module-level
    ``stemmer``.

    Args:
        text: The paragraph to clean; any value is accepted and converted
            with ``str()``.

    Returns:
        The cleaned, stemmed text as a single string.
    """
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = stopword.remove(text)
    # Filter token by token instead of replacing " word " substrings: the
    # substring approach only matched words with a space on BOTH sides, so
    # it missed ignore-words at the start or end of the paragraph and the
    # second of two adjacent identical words.
    ignored = set(custom_ignore)
    text = " ".join(token for token in text.split() if token not in ignored)
    text = stemmer.stem(text)
    return text

# Clean every paragraph element-wise; the result is stored next to the original text.
df['text_clean'] = df['text_original'].map(clean_text)


In [None]:

# 4. K-MEANS CLUSTERING
print("Menghitung Cluster...")

# Fail early with a clear message: with no paragraphs, TfidfVectorizer
# would otherwise stop with an "empty vocabulary" error that does not
# point at the real cause.
if df.empty:
    raise ValueError("Tidak ada paragraf untuk di-cluster: data kosong (cek hasil download).")

# Turn the cleaned paragraphs into a sparse TF-IDF matrix (one row per paragraph).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])

JUMLAH_CLUSTER = 3
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 vs 'auto'); pinning it keeps the clustering identical across
# versions and avoids the FutureWarning emitted by releases in between.
kmeans = KMeans(n_clusters=JUMLAH_CLUSTER, n_init=10, random_state=42)
kmeans.fit(X)
# Store each paragraph's cluster label alongside its text.
df['cluster'] = kmeans.labels_


In [None]:

# 5. SCATTER PLOT VISUALISATION
print("Membuat Grafik Sebaran...")

# Project the densified TF-IDF matrix onto its first two principal components.
pca = PCA(n_components=2)
coords = pca.fit_transform(X.toarray())

# One point per paragraph; cluster membership drives both colour and marker.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=coords[:, 0],
    y=coords[:, 1],
    hue=df['cluster'],
    style=df['cluster'],
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title(f'Peta Topik Pidato ({JUMLAH_CLUSTER} Cluster)')
plt.show()


In [None]:

# 6. WORD CLOUD VISUALISATION
print("WORD CLOUD PER TOPIK")

# Draw one word cloud per cluster, followed by a sample original sentence.
for i in range(JUMLAH_CLUSTER):
    print(f"\n📂 KELOMPOK TOPIK {i}:")

    # Concatenate all cleaned paragraphs of this cluster into one string.
    subset = df[df['cluster'] == i]
    text_gabungan = " ".join(subset['text_clean'])

    # Test the stripped text, not its length: joining paragraphs that were
    # cleaned down to empty strings yields only spaces, and WordCloud raises
    # ValueError when it is given no words at all.
    if text_gabungan.strip():
        # Build the word cloud.
        wc = WordCloud(
            background_color='white',
            width=800,
            height=400,
            colormap='Dark2',  # colour map used for the words
            stopwords=custom_ignore
        ).generate(text_gabungan)

        # Render the image.
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")  # hide the X/Y axes
        plt.title(f"Kata Kunci Dominan - Kelompok {i}")
        plt.show()

        # Show the first original paragraph (first 150 characters) for context.
        print("   🗣️ Contoh Kalimat Asli:")
        print(f"   - \"{subset['text_original'].iloc[0][:150]}...\"")
    else:
        print("   (Data terlalu sedikit untuk membuat Word Cloud)")