# BERTopic Berita

In [1]:
import sys
!{sys.executable} -m pip install bertopic gensim numpy




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\laragon\bin\python\python-3.10\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import ast
import gensim
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# ==============================================================================
# Stages 1-4: load the data, train Word2Vec, build the embedding wrapper,
# and fit BERTopic
# ==============================================================================
print("Tahap 1: Membaca dan memproses file data...")
try:
    df = pd.read_csv('hasil_preprocessing_berita.csv')
except FileNotFoundError:
    print("File tidak ditemukan")
    # Abort here: without `df` the very next statement would fail with an
    # unrelated NameError that hides the real cause (the missing CSV).
    raise
else:
    print("File berhasil dimuat.")

# The 'hasil_preprocessing' column is expected to hold the string form of a
# token list; ast.literal_eval parses it back without executing any code.
df['tokens'] = df['hasil_preprocessing'].apply(ast.literal_eval)
tokenized_docs = df['tokens'].tolist()
print(f"Data siap. Jumlah dokumen: {len(tokenized_docs)}")
print("-" * 30)

# Stage 2: learn 100-dimensional word vectors from the tokenised documents.
print("Tahap 2: Melatih model Word2Vec (100 dimensi)...")
w2v_model = gensim.models.Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=2,      # ignore words seen fewer than twice
    workers=4,        # training threads
)
print("Model Word2Vec selesai dilatih.")
print(30 * "-")

# Stage 3 starts here: the embedding wrapper is defined right below.
print("Tahap 3: Membuat wrapper embedding...")
class GensimEmbedding:
    """Turn whitespace-tokenised documents into averaged word-vector embeddings.

    The wrapped ``model`` must expose ``wv`` (a mapping from word to vector
    that supports ``in``) and ``vector_size`` (the vector dimensionality),
    as a trained gensim ``Word2Vec`` model does.

    NOTE(review): this is a plain class, not a subclass of
    ``bertopic.backend.BaseEmbedder``; confirm how BERTopic treats it when it
    is passed as ``embedding_model``.
    """

    def __init__(self, model):
        self.model = model

    def embed(self, documents, verbose=False):
        """Return one embedding row per document.

        Args:
            documents: Iterable of strings; each is split on whitespace.
            verbose: Accepted for compatibility with callers that pass a
                verbosity flag; it has no effect here.

        Returns:
            A 2-D ``numpy`` array of shape ``(n_documents, vector_size)``.
            A document's row is the mean of the vectors of its in-vocabulary
            words, or all zeros when none of its words are in the vocabulary.
        """
        keyed_vectors = self.model.wv
        embeddings = []
        for doc_string in documents:
            word_vectors = [
                keyed_vectors[word]
                for word in doc_string.split()
                if word in keyed_vectors
            ]
            if word_vectors:
                embeddings.append(np.mean(word_vectors, axis=0))
            else:
                # No known word: fall back to a zero vector of the right size.
                embeddings.append(np.zeros(self.model.vector_size))
        if not embeddings:
            # np.array([]) would be 1-D; keep the (n_docs, dim) contract.
            return np.empty((0, self.model.vector_size))
        return np.array(embeddings)
custom_embedding_model = GensimEmbedding(w2v_model)
print("Wrapper embedding siap digunakan.")
print("-" * 30)

# Stage 4: fit BERTopic on the documents.
print("Tahap 4: Melatih model BERTopic...")
# Common Indonesian function words are excluded from the topic keywords.
vectorizer_model = CountVectorizer(stop_words=["dalam", "dan", "di", "untuk", "pada", "dengan", "yang", "itu", "ini"])
# BERTopic works on strings, so join each token list back into one document.
docs_for_bertopic = [" ".join(tokens) for tokens in tokenized_docs]
topic_model = BERTopic(
    embedding_model=custom_embedding_model,
    vectorizer_model=vectorizer_model,
    verbose=True
)
# Compute the Word2Vec document embeddings up front and pass them in
# explicitly. BERTopic only dispatches to a custom backend that subclasses
# bertopic.backend.BaseEmbedder; a plain wrapper object is not recognised, and
# BERTopic would otherwise embed the documents with its default model instead
# of the Word2Vec vectors trained above.
doc_embeddings = custom_embedding_model.embed(docs_for_bertopic)
topics, probabilities = topic_model.fit_transform(
    docs_for_bertopic, embeddings=doc_embeddings
)
print("Model BERTopic selesai dilatih!")
print("-" * 30)

# ==============================================================================
# Stage 5: display the discovered topics
# ==============================================================================
print("Tahap 5: Menampilkan hasil topik...")
# Keep the topic overview in a variable; it is reused when saving to CSV.
topic_info = topic_model.get_topic_info()
print("Daftar Topik yang Ditemukan:", topic_info, sep="\n")
print(30 * "-")

# ==============================================================================
# Stage 6: save the results to CSV files
# ==============================================================================
print("Tahap 6: Menyimpan hasil ke file CSV...")

# --- 1. Topic overview ---
# One row per topic, as returned by get_topic_info().
topic_info.to_csv("informasi_topik_berita.csv", index=False)
print("File 'informasi_topik_berita.csv' berhasil disimpan.")

# --- 2. Topic assignment per document ---
# Build one row per document (text, assigned topic id, probability), then
# left-join the topic name and representation from topic_info so each row is
# self-explanatory. The join key 'Topic' duplicates 'Topik_ID' and is dropped.
hasil_df = (
    pd.DataFrame({
        "Dokumen": docs_for_bertopic,
        "Topik_ID": topics,
        "Probabilitas": probabilities,
    })
    .merge(
        topic_info[['Topic', 'Name', 'Representation']],
        left_on='Topik_ID',
        right_on='Topic',
        how='left',
    )
    .drop(columns=['Topic'])
)

hasil_df.to_csv("hasil_topik_per_dokumen_berita.csv", index=False)
print("File 'hasil_topik_per_dokumen_berita.csv' berhasil disimpan.")
print(30 * "-")
print("Proses selesai!")

  from .autonotebook import tqdm as notebook_tqdm
