# tugas 5 berita

In [1]:
import sys
!{sys.executable} -m pip install bertopic gensim numpy

^C


Collecting bertopic
  Using cached bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Using cached hdbscan-0.8.40-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting llvmlite>0.36.0 (from bertopic)
  Using cached llvmlite-0.45.1-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers>=0.4.1->bertopic)
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers>=0.4.1->bertopic)
  Using cached torch-2.9.0-cp310-cp310-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers>=0.4.1->bertopic)
  Using cached huggingface_hub-0.35.3-py3-none-any.whl.

In [3]:
import pandas as pd
import ast
import gensim
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# ==============================================================================
# Tahap 1 - 4 (Sama seperti sebelumnya, tidak ada perubahan)
# ==============================================================================
print("Tahap 1: Membaca dan memproses file data...")
try:
    df = pd.read_csv('hasil_preprocessing_berita.csv')
    print("File berhasil dimuat.")
except FileNotFoundError:
    print("File tidak ditemukan")
    
df['tokens'] = df['hasil_preprocessing'].apply(ast.literal_eval)
tokenized_docs = df['tokens'].tolist()
print(f"Data siap. Jumlah dokumen: {len(tokenized_docs)}")
print("-" * 30)

print("Tahap 2: Melatih model Word2Vec (100 dimensi)...")
w2v_model = gensim.models.Word2Vec(
    sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4
)
print("Model Word2Vec selesai dilatih.")
print("-" * 30)

print("Tahap 3: Membuat wrapper embedding...")
class GensimEmbedding:
    def __init__(self, model):
        self.model = model
    def embed(self, documents):
        embeddings = []
        for doc_string in documents:
            doc_tokens = doc_string.split()
            word_vectors = [self.model.wv[word] for word in doc_tokens if word in self.model.wv]
            if not word_vectors:
                embeddings.append(np.zeros(self.model.vector_size))
            else:
                embeddings.append(np.mean(word_vectors, axis=0))
        return np.array(embeddings)
custom_embedding_model = GensimEmbedding(w2v_model)
print("Wrapper embedding siap digunakan.")
print("-" * 30)

print("Tahap 4: Melatih model BERTopic...")
vectorizer_model = CountVectorizer(stop_words=["dalam", "dan", "di", "untuk", "pada", "dengan", "yang", "itu", "ini"])
docs_for_bertopic = [" ".join(tokens) for tokens in tokenized_docs]
topic_model = BERTopic(
    embedding_model=custom_embedding_model,
    vectorizer_model=vectorizer_model,
    verbose=True
)
topics, probabilities = topic_model.fit_transform(docs_for_bertopic)
print("Model BERTopic selesai dilatih!")
print("-" * 30)

# ==============================================================================
# Tahap 5: Tampilkan Hasil (Tidak ada perubahan)
# ==============================================================================
print("Tahap 5: Menampilkan hasil topik...")
topic_info = topic_model.get_topic_info() # Simpan info topik ke variabel
print("Daftar Topik yang Ditemukan:")
print(topic_info)
print("-" * 30)

# ==============================================================================
# Tahap 6: Simpan Hasil ke File CSV 💾
# ==============================================================================
print("Tahap 6: Menyimpan hasil ke file CSV...")

# --- 1. Menyimpan Informasi Umum Setiap Topik ---
# File ini berisi daftar topik, jumlah dokumen per topik, dan kata kuncinya.
topic_info.to_csv("informasi_topik_berita.csv", index=False)
print("File 'informasi_topik_berita.csv' berhasil disimpan.")

# --- 2. Menyimpan Hasil Topik untuk Setiap Dokumen ---
# Kita buat DataFrame baru yang berisi dokumen asli dan hasil topiknya.
# `topics` adalah list nomor topik untuk setiap dokumen.
# `probabilities` adalah list probabilitas untuk setiap dokumen.
hasil_df = pd.DataFrame({
    "Dokumen": docs_for_bertopic, # Anda juga bisa menggunakan kolom asli dari df awal
    "Topik_ID": topics,
    "Probabilitas": probabilities
})

# Untuk membuatnya lebih informatif, kita bisa gabungkan dengan kata kunci topiknya
# Menggabungkan (merge) dengan topic_info berdasarkan 'Topik_ID'
hasil_df = pd.merge(hasil_df, topic_info[['Topic', 'Name', 'Representation']], left_on='Topik_ID', right_on='Topic', how='left')
hasil_df = hasil_df.drop(columns=['Topic']) # Hapus kolom 'Topic' yang redundant

hasil_df.to_csv("hasil_topik_per_dokumen_berita.csv", index=False)
print("File 'hasil_topik_per_dokumen_berita.csv' berhasil disimpan.")
print("-" * 30)
print("Proses selesai!")

Tahap 1: Membaca dan memproses file data...
File berhasil dimuat.
Data siap. Jumlah dokumen: 3653
------------------------------
Tahap 2: Melatih model Word2Vec (100 dimensi)...


2025-10-10 12:21:34,995 - BERTopic - Embedding - Transforming documents to embeddings.


Model Word2Vec selesai dilatih.
------------------------------
Tahap 3: Membuat wrapper embedding...
Wrapper embedding siap digunakan.
------------------------------
Tahap 4: Melatih model BERTopic...


Batches:   0%|          | 0/115 [00:00<?, ?it/s]

2025-10-10 12:23:12,899 - BERTopic - Embedding - Completed ✓
2025-10-10 12:23:12,900 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-10 12:23:28,762 - BERTopic - Dimensionality - Completed ✓
2025-10-10 12:23:28,763 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-10 12:23:28,869 - BERTopic - Cluster - Completed ✓
2025-10-10 12:23:28,872 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-10 12:23:29,347 - BERTopic - Representation - Completed ✓


Model BERTopic selesai dilatih!
------------------------------
Tahap 5: Menampilkan hasil topik...
Daftar Topik yang Ditemukan:
    Topic  Count                                Name  \
0      -1   1320  -1_indonesia_jalan_jawa_masyarakat   
1       0    178          0_islam_allah_istri_masjid   
2       1    154         1_polisi_polres_aman_sangka   
3       2     88            2_bahan_ayam_daun_bawang   
4       3     69            3_digital_media_ai_layan   
..    ...    ...                                 ...   
71     70     12       70_asuransi_mobil_tlo_kendara   
72     71     12    71_kucing_maggot_wayang_gandrung   
73     72     11      72_stunting_muslimat_nu_miskin   
74     73     11               73_kec_tanah_kan_desa   
75     74     10        74_rias_lomba_serta_kelereng   

                                       Representation  \
0   [indonesia, jalan, jawa, masyarakat, timur, di...   
1   [islam, allah, istri, masjid, suami, perempuan...   
2   [polisi, polres, aman, s