In [None]:

# 1. INSTALASI & IMPORT
print("‚è≥ Menginstall library...")
!pip install -q pandas scikit-learn plotly Sastrawi matplotlib

import pandas as pd
import os
import re
import string
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# 2. LOAD DATA (AUTO DOWNLOAD)
print("üì• Mengambil data...")
if not os.path.exists('data'):
    os.makedirs('data')

# Download data dari GitHub kamu
!rm -f data/data_aturan.csv
!wget -q -O data/data_aturan.csv https://raw.githubusercontent.com/rhnrafif/datamining_1/main/data/data_aturan.csv

# Baca CSV
df = pd.read_csv('data/data_aturan.csv')
print(f"‚úÖ Data dimuat: {len(df)} baris")

# 3. PREPROCESSING (SAME AS THE CHATBOT)
print("‚öôÔ∏è Melakukan Preprocessing Teks...")

# Build both Sastrawi factories first, then the tools they produce.
# `stopword` and `stemmer` are the module-level objects used by clean_text.
factory_stop, factory_stem = StopWordRemoverFactory(), StemmerFactory()
stopword = factory_stop.create_stop_word_remover()
stemmer = factory_stem.create_stemmer()

# Translation table that deletes every ASCII punctuation character. Built once
# here so it is not rebuilt for each row that clean_text processes.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one piece of raw text before vectorization.

    The text is lowercased, runs of digits and ASCII punctuation are removed,
    then the module-level Sastrawi ``stopword`` remover and ``stemmer`` are
    applied, in that order.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string. Missing values (``None`` or a float NaN) yield an
        empty string instead of the literal token "none"/"nan", which would
        otherwise enter the vocabulary as a real word.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCT_TABLE)
    text = stopword.remove(text)
    return stemmer.stem(text)

# Combine question and answer into a single text per row for analysis.
# fillna('') keeps the remaining text when one of the two columns is missing;
# without it, string + NaN turns the whole combined value into NaN and the
# row's existing question or answer is lost.
df['text_asli'] = df['pertanyaan'].fillna('') + " | " + df['jawaban'].fillna('')
df['text_clean'] = df['text_asli'].apply(clean_text)

# 4. TF-IDF & PCA PROCESS
print("üßÆ Menghitung Vektor & PCA...")

# 1. Turn the cleaned text into numeric TF-IDF vectors (vectorization).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])

# 2. Reduce to 2 dimensions (X and Y) with PCA.
# The sparse TF-IDF matrix is converted to a dense array before fitting.
# random_state is fixed so the projection is reproducible even when
# scikit-learn selects a randomized SVD solver (its documented behaviour for
# larger inputs); it has no effect on the exact solvers.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X.toarray())

# Store the PCA coordinates on the DataFrame.
df['x_pca'] = pca_result[:, 0]
df['y_pca'] = pca_result[:, 1]

# 3. (Optional) Cluster with K-Means so the plot can be coloured by theme.
# The data is split into 3 theme groups. n_init is set explicitly because its
# default differs between scikit-learn releases; pinning it keeps the cluster
# assignment consistent across versions.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X)
# Cast to str so plotly treats the cluster as a categorical colour.
df['cluster'] = cluster_labels.astype(str)

# 5. INTERACTIVE VISUALIZATION
print("üìä Menampilkan Grafik...")

# Axis titles shown in place of the raw PCA column names.
axis_labels = {
    'x_pca': 'Principal Component 1',
    'y_pca': 'Principal Component 2',
}

# One point per row at its PCA coordinates. The K-Means cluster drives both
# the colour and the marker symbol, and hovering a point shows its question.
# NOTE(review): plotly documents size_max as applying only when `size` is
# given - confirm whether it is still needed here.
fig = px.scatter(
    data_frame=df,
    x='x_pca',
    y='y_pca',
    color='cluster',
    symbol='cluster',
    hover_data=['pertanyaan'],
    labels=axis_labels,
    title='Visualisasi Sebaran Data HRD (PCA Projection)',
    size_max=10,
)

# Uniform marker size with a dark outline for every trace.
marker_style = {'size': 12, 'line': {'width': 2, 'color': 'DarkSlateGrey'}}
fig.update_traces(marker=marker_style)
fig.show()


# Horizontal rule reused by both section banners below.
_rule = "=" * 40

print("\n" + _rule)
print("üìÑ HASIL DATA PCA (5 Baris Pertama)")
print(_rule)

# 1. Build a display-only DataFrame: output column name -> source column in df.
#    'target' is the cluster label assigned automatically by K-Means.
_display_columns = {'PC1': 'x_pca', 'PC2': 'y_pca', 'target': 'cluster'}
output_df = pd.DataFrame(
    {shown: df[source] for shown, source in _display_columns.items()}
)

# 2. Show the first rows of the table.
print(output_df.head())

print("\n" + _rule)
print("üìà EXPLAINED VARIANCE RATIO")
print(_rule)

# 3. Show the explained variance ratio, i.e. the fraction of the data's
#    variance captured by PC1 and PC2 respectively.
print(pca.explained_variance_ratio_)