In [None]:
# Install library
!pip install pandas gensim nltk

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora

# Download the NLTK resources used for preprocessing:
#   - 'stopwords': the stopword lists read via stopwords.words(...)
#   - 'punkt' / 'punkt_tab': tokenizer data, presumably what word_tokenize needs
#     (which of the two is looked up depends on the installed NLTK version —
#     TODO confirm)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def get_raw_url(github_url):
    """Convert a GitHub "blob" page URL into the matching raw-content URL.

    A URL is rewritten only when it contains both ``github.com`` and
    ``/blob/``; any other string is returned unchanged.

    Args:
        github_url: URL string, e.g.
            ``https://github.com/user/repo/blob/main/data.csv``.

    Returns:
        The URL with the host swapped for ``raw.githubusercontent.com`` and
        the first ``/blob/`` path segment removed, or ``github_url`` itself
        when it does not look like a GitHub blob URL.
    """
    if 'github.com' in github_url and '/blob/' in github_url:
        # Replace only the FIRST occurrence of each marker: the host comes
        # first in the URL and the first '/blob/' is the one GitHub inserts.
        # Later occurrences belong to the file path (e.g. a folder literally
        # named "blob") and must be kept.
        raw_url = github_url.replace('github.com', 'raw.githubusercontent.com', 1)
        return raw_url.replace('/blob/', '/', 1)
    return github_url

# === URL CONFIG ===
# GitHub page URL of the CSV file to analyse. It is passed through
# get_raw_url() before being handed to pandas.
url_file_github = "https://github.com/USERNAME_KAMU/NAMA_REPO/blob/main/data.csv"

# The whole pipeline (load -> preprocess -> dictionary/corpus -> LDA -> print)
# runs inside one try block so that any failure is reported as a single short
# message instead of a traceback.
try:
    print("⏳ Loading data...")
    df = pd.read_csv(get_raw_url(url_file_github))

    # Auto-detect the text column: the first column whose lower-cased name is
    # one of the known names, otherwise fall back to the first column.
    col_text = next((c for c in df.columns if c.lower() in ['text', 'tweet', 'content', 'review']), df.columns[0])
    print(f"✅ Data Loaded. Target Column: {col_text}")

    # 1. Simple preprocessing
    print("⏳ Preprocessing...")
    stop_words = set(stopwords.words('indonesian'))  # Use 'english' if the data is in English

    def clean_text(text):
        """Tokenize one document and return its list of kept tokens.

        The text is stringified and lower-cased, then split with
        word_tokenize. A token is kept only if it is alphanumeric (drops
        punctuation), is not purely numeric (drops numbers) and is not in
        ``stop_words``.
        """
        tokens = word_tokenize(str(text).lower())
        # Remove numbers, punctuation and stopwords. isalnum() alone would
        # keep tokens such as "2023", so purely numeric tokens are excluded
        # explicitly.
        return [
            word for word in tokens
            if word.isalnum() and not word.isnumeric() and word not in stop_words
        ]

    # Drop missing cells first: str(NaN) is "nan", which would otherwise be
    # tokenized and end up in the vocabulary as a real word.
    processed_docs = df[col_text].dropna().apply(clean_text)
    # Documents with no tokens left carry no information for the model.
    processed_docs = processed_docs[processed_docs.map(len) > 0]
    if processed_docs.empty:
        # Fail here with a clear message rather than deep inside the training
        # call; the except clause below turns it into the usual "Error: ..."
        # line.
        raise ValueError(f"No usable text left in column '{col_text}' after preprocessing")

    # 2. Build the dictionary (token <-> id mapping) and the bag-of-words corpus
    dictionary = corpora.Dictionary(processed_docs)
    corpus = [dictionary.doc2bow(text) for text in processed_docs]

    # 3. Train the LDA model (random_state is fixed for reproducibility)
    print("⏳ Training LDA Model (3 Topik)...")
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=dictionary,
                                               num_topics=3,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

    # 4. Show the result: one line per topic (-1 requests all topics)
    print("\n=== HASIL TOPIK LDA ===")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topik {idx}: {topic}\n")

except Exception as e:
    # Deliberate best-effort reporting for notebook use: print the message and
    # carry on instead of raising.
    print(f"Error: {e}")