In [1]:
!pip install bertopic gensim numpy

Collecting bertopic
  Obtaining dependency information for bertopic from https://files.pythonhosted.org/packages/98/05/2d6b305391efff89c2b4cf19cf847f971ca163eb5c149d0d2ffac0a9c7ed/bertopic-0.17.3-py3-none-any.whl.metadata
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Obtaining dependency information for hdbscan>=0.8.29 from https://files.pythonhosted.org/packages/c0/cb/6b4254f8a33e075118512e55acf3485c155ea52c6c35d69a985bdc59297c/hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata
  Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Obtaining dependency information for umap-learn>=0.5.0 from https://files.pythonhosted.org/packages/6b/b1/c24deeda9baf1fd491aaad941ed89e0fed6c583a117fd7b79e0a33a1e6c0/umap_learn-0.5.9.post2-py3-none-any.whl.metadata
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Obtai


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import ast
import gensim
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# ==============================================================================
# Tahap 1 - 4 (Sama seperti sebelumnya, tidak ada perubahan)
# ==============================================================================
print("Tahap 1: Membaca dan memproses file data...")
try:
    df = pd.read_csv('hasil_preprocessing_berita.csv')
    print("File 'data_berita_bersih.csv' berhasil dimuat.")
except FileNotFoundError:
    print("File tidak ditemukan. Membuat data tiruan untuk demonstrasi.")
    data_contoh = {
        'hasil_preprocessing': [
            "['sidoarjobangsaonlinecom', 'proses', 'cari', 'korban', 'santri']",
            "['probolinggo', 'bangsaonlinecom', 'polres', 'probolinggo', 'kota']"
        ]
    }
    df = pd.DataFrame(data_contoh)
    
df['tokens'] = df['hasil_preprocessing'].apply(ast.literal_eval)
tokenized_docs = df['tokens'].tolist()
print(f"Data siap. Jumlah dokumen: {len(tokenized_docs)}")
print("-" * 30)

print("Tahap 2: Melatih model Word2Vec (100 dimensi)...")
w2v_model = gensim.models.Word2Vec(
    sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4
)
print("Model Word2Vec selesai dilatih.")
print("-" * 30)

print("Tahap 3: Membuat wrapper embedding...")
class GensimEmbedding:
    def __init__(self, model):
        self.model = model
    def embed(self, documents):
        embeddings = []
        for doc_string in documents:
            doc_tokens = doc_string.split()
            word_vectors = [self.model.wv[word] for word in doc_tokens if word in self.model.wv]
            if not word_vectors:
                embeddings.append(np.zeros(self.model.vector_size))
            else:
                embeddings.append(np.mean(word_vectors, axis=0))
        return np.array(embeddings)
custom_embedding_model = GensimEmbedding(w2v_model)
print("Wrapper embedding siap digunakan.")
print("-" * 30)

print("Tahap 4: Melatih model BERTopic...")
vectorizer_model = CountVectorizer(stop_words=["dalam", "dan", "di", "untuk", "pada", "dengan", "yang", "itu", "ini"])
docs_for_bertopic = [" ".join(tokens) for tokens in tokenized_docs]
topic_model = BERTopic(
    embedding_model=custom_embedding_model,
    vectorizer_model=vectorizer_model,
    verbose=True
)
topics, probabilities = topic_model.fit_transform(docs_for_bertopic)
print("Model BERTopic selesai dilatih!")
print("-" * 30)

# ==============================================================================
# Tahap 5: Tampilkan Hasil (Tidak ada perubahan)
# ==============================================================================
print("Tahap 5: Menampilkan hasil topik...")
topic_info = topic_model.get_topic_info() # Simpan info topik ke variabel
print("Daftar Topik yang Ditemukan:")
print(topic_info)
print("-" * 30)

# ==============================================================================
# Tahap 6: Simpan Hasil ke File CSV 💾
# ==============================================================================
print("Tahap 6: Menyimpan hasil ke file CSV...")

# --- 1. Menyimpan Informasi Umum Setiap Topik ---
# File ini berisi daftar topik, jumlah dokumen per topik, dan kata kuncinya.
topic_info.to_csv("informasi_topik.csv", index=False)
print("File 'informasi_topik.csv' berhasil disimpan.")

# --- 2. Menyimpan Hasil Topik untuk Setiap Dokumen ---
# Kita buat DataFrame baru yang berisi dokumen asli dan hasil topiknya.
# `topics` adalah list nomor topik untuk setiap dokumen.
# `probabilities` adalah list probabilitas untuk setiap dokumen.
hasil_df = pd.DataFrame({
    "Dokumen": docs_for_bertopic, # Anda juga bisa menggunakan kolom asli dari df awal
    "Topik_ID": topics,
    "Probabilitas": probabilities
})

# Untuk membuatnya lebih informatif, kita bisa gabungkan dengan kata kunci topiknya
# Menggabungkan (merge) dengan topic_info berdasarkan 'Topik_ID'
hasil_df = pd.merge(hasil_df, topic_info[['Topic', 'Name', 'Representation']], left_on='Topik_ID', right_on='Topic', how='left')
hasil_df = hasil_df.drop(columns=['Topic']) # Hapus kolom 'Topic' yang redundant

hasil_df.to_csv("hasil_topik_per_dokumen.csv", index=False)
print("File 'hasil_topik_per_dokumen.csv' berhasil disimpan.")
print("-" * 30)
print("Proses selesai!")

Tahap 1: Membaca dan memproses file data...
File 'data_berita_bersih.csv' berhasil dimuat.
Data siap. Jumlah dokumen: 925
------------------------------
Tahap 2: Melatih model Word2Vec (100 dimensi)...


2025-10-09 02:01:55,125 - BERTopic - Embedding - Transforming documents to embeddings.


Model Word2Vec selesai dilatih.
------------------------------
Tahap 3: Membuat wrapper embedding...
Wrapper embedding siap digunakan.
------------------------------
Tahap 4: Melatih model BERTopic...


Batches:   0%|          | 0/29 [00:00<?, ?it/s]

2025-10-09 02:02:16,753 - BERTopic - Embedding - Completed ✓
2025-10-09 02:02:16,754 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-09 02:02:17,617 - BERTopic - Dimensionality - Completed ✓
2025-10-09 02:02:17,618 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-09 02:02:17,661 - BERTopic - Cluster - Completed ✓
2025-10-09 02:02:17,665 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-09 02:02:17,817 - BERTopic - Representation - Completed ✓


Model BERTopic selesai dilatih!
------------------------------
Tahap 5: Menampilkan hasil topik...
Daftar Topik yang Ditemukan:
    Topic  Count                                Name  \
0      -1    334        -1_sehat_masyarakat_nu_jalan   
1       0     69  0_indonesia_presiden_tanah_menteri   
2       1     66       1_kota_diri_masyarakat_sampah   
3       2     62            2_diri_senyum_luka_ruang   
4       3     47       3_turnamen_tim_esport_tanding   
5       4     36             4_lagu_album_rilis_film   
6       5     32       5_laku_polisi_korban_sidoarjo   
7       6     28       6_motor_balap_listrik_kendara   
8       7     28           7_tafsir_wa_mustain_tuhan   
9       8     25                  8_rp_gram_harga_kg   
10      9     24               9_angin_ms_suhu_cuaca   
11     10     24              10_asep_kiai_kh_chalim   
12     11     22      11_tuban_bravo_pwi_supermarket   
13     12     17    12_santri_evakuasi_korban_runtuh   
14     13     16        13_digit