In [51]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [52]:
print("GPU available:", torch.cuda.is_available())

GPU available: False


In [53]:
df = pd.read_csv("../data/stemmed_merged_kubu_01.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [54]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b|\bya\b', '', regex=True)

In [55]:
docs = df['full_text'].dropna().astype(str).tolist()

In [56]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [57]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [58]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=15)


2025-05-20 19:29:34,767 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 934/934 [04:54<00:00,  3.17it/s]
2025-05-20 19:34:29,624 - BERTopic - Embedding - Completed ✓
2025-05-20 19:34:29,625 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-20 19:34:35,696 - BERTopic - Dimensionality - Completed ✓
2025-05-20 19:34:35,698 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-20 19:36:17,745 - BERTopic - Cluster - Completed ✓
2025-05-20 19:36:17,753 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-20 19:36:18,214 - BERTopic - Representation - Completed ✓
2025-05-20 19:36:18,844 - BERTopic - Topic reduction - Reducing number of topics
2025-05-20 19:36:18,871 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-20 19:36:19,187 - BERTopic - Representation - Completed ✓
2025-05-20 19:36:19,191 - BERTopic - Topic reduction - Reduced 

<bertopic._bertopic.BERTopic at 0x19d26d093a0>

In [59]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=10)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                 Name  \
0      -1   9996                 -1_anies_01_imin_cak   
1       0  10645              0_01_dukung_calon_pilih   
2       1   3530  1_anies_presiden_baswedan_indonesia   
3       2   1620        2_debat_calon_presiden_kritik   
4       3   1447  3_politik_partai_demokrasi_demokrat   
5       4    835              4_cak_imin_anies_banget   
6       5    647          5_video_videotron_pop_korea   
7       6    418        6_twitter_berita_media_sosial   
8       7    223              7_program_anies_01_misi   
9       8    206             8_pintar_orang_dukung_01   
10      9    152      9_palestina_israel_merdeka_bela   
11     10     51          10_viral_viralkan_ham_virus   
12     11     51          11_respons_reply_respon_cak   
13     12     21             12_estate_food_tani_blak   
14     13     19              13_store_unduh_app_play   

                                       Representation  \
0   [anies, 01, imin, cak, duk

In [60]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9996,-1_anies_01_imin_cak,"[anies, 01, imin, cak, dukung, pilih, baswedan...","[dukung 01 bilang pilih rakyat daerah, bismill..."
1,0,10645,0_01_dukung_calon_pilih,"[01, dukung, calon, pilih, pasang, menang, ora...","[kayak kalah dukung 01 suka gagas, dukung anie..."
2,1,3530,1_anies_presiden_baswedan_indonesia,"[anies, presiden, baswedan, indonesia, imin, p...",[anies pimpin sejati presiden republik indones...
3,2,1620,2_debat_calon_presiden_kritik,"[debat, calon, presiden, kritik, imin, cak, wa...",[pasang calon 01 debat calon wakil presiden hi...
4,3,1447,3_politik_partai_demokrasi_demokrat,"[politik, partai, demokrasi, demokrat, pilih, ...",[demokrat ngototnya aneh suara kalah nasional ...


In [61]:
for tid in topic_info['Topic'][:10]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('anies', np.float64(0.042107161561196926)), ('01', np.float64(0.041770042554131095)), ('imin', np.float64(0.030162217323258876)), ('cak', np.float64(0.029684032584952957)), ('dukung', np.float64(0.027848555022341407)), ('pilih', np.float64(0.02440429314392522)), ('baswedan', np.float64(0.019908539497583224)), ('amin', np.float64(0.019509279635159298)), ('orang', np.float64(0.018259422299041924)), ('calon', np.float64(0.01695186311184994))]

Topic 0
[('01', np.float64(0.07678779183281045)), ('dukung', np.float64(0.03896516492098757)), ('calon', np.float64(0.030278729802373668)), ('pilih', np.float64(0.029235719999989092)), ('pasang', np.float64(0.023750555335709158)), ('menang', np.float64(0.023401264750183767)), ('orang', np.float64(0.022922933559561102)), ('presiden', np.float64(0.020065946232007865)), ('kayak', np.float64(0.01976185326696861)), ('banget', np.float64(0.018154983656687423))]

Topic 1
[('anies', np.float64(0.07513222808046073)), ('presiden', np.float64(0.054

In [62]:
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_topics()

In [63]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [64]:
from sklearn.cluster import KMeans

num_topics = 15
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=15)

2025-05-20 19:36:19,776 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 934/934 [04:49<00:00,  3.23it/s]
2025-05-20 19:41:09,173 - BERTopic - Embedding - Completed ✓
2025-05-20 19:41:09,174 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-20 19:41:14,917 - BERTopic - Dimensionality - Completed ✓
2025-05-20 19:41:14,920 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-20 19:41:14,978 - BERTopic - Cluster - Completed ✓
2025-05-20 19:41:14,986 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-20 19:41:15,365 - BERTopic - Representation - Completed ✓


In [65]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                            Name  \
0       0   3525          0_01_dukung_pilih_amin   
1       1   3463         1_01_dukung_orang_pilih   
2       2   3304        2_01_dukung_pasang_pilih   
3       3   2851       3_anies_baswedan_imin_cak   
4       4   2314    4_presiden_anies_pilih_calon   
5       5   2271      5_01_kampanye_dukung_kerja   
6       6   2269      6_anies_indonesia_imin_cak   
7       7   2033         7_cak_imin_anies_banget   
8       8   1730      8_kayak_suara_01_indonesia   
9       9   1416       9_presiden_calon_wakil_01   
10     10   1286        10_menang_01_survei_gara   
11     11   1056     11_video_program_01_twitter   
12     12    871  12_politik_demokrasi_partai_01   
13     13    762      13_debat_calon_01_presiden   
14     14    710      14_debat_cak_imin_presiden   

                                       Representation  \
0   [01, dukung, pilih, amin, buzzer, coblos, kubu...   
1   [01, dukung, orang, pilih, pasang, calon, bang...

In [66]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,3525,0_01_dukung_pilih_amin,"[01, dukung, pilih, amin, buzzer, coblos, kubu...","[lucu dukung 01, pas pilih amin 01, coblos 01 ..."
1,1,3463,1_01_dukung_orang_pilih,"[01, dukung, orang, pilih, pasang, calon, bang...",[wajib alas pilih pasang calon deh dukung pasa...
2,2,3304,2_01_dukung_pasang_pilih,"[01, dukung, pasang, pilih, banget, buzzer, ca...","[dukung 01 banyak korban buzzer deh, komen duk..."
3,3,2851,3_anies_baswedan_imin_cak,"[anies, baswedan, imin, cak, rasyid, ayah, ami...",[anak sebut lengkap anies rasyid baswedan tema...
4,4,2314,4_presiden_anies_pilih_calon,"[presiden, anies, pilih, calon, baswedan, waki...",[cak imin sebut santri calon presiden calon wa...


In [67]:
for tid in topic_info['Topic'][:10]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('01', np.float64(0.17999234753117513)), ('dukung', np.float64(0.05316047423135683)), ('pilih', np.float64(0.038002265207892454)), ('amin', np.float64(0.02456071220150472)), ('buzzer', np.float64(0.02219320211689753)), ('coblos', np.float64(0.02117551507583413)), ('kubu', np.float64(0.0188911609046937)), ('bang', np.float64(0.018724551401291385)), ('banget', np.float64(0.01695381068698922)), ('all', np.float64(0.015265770499369108))]

Topic 1
[('01', np.float64(0.06116502636690249)), ('dukung', np.float64(0.042540085387303604)), ('orang', np.float64(0.04154855264352592)), ('pilih', np.float64(0.03055361364644349)), ('pasang', np.float64(0.028660519732379448)), ('calon', np.float64(0.0212359306588387)), ('banget', np.float64(0.019680014415701514)), ('teman', np.float64(0.019485637074210575)), ('bilang', np.float64(0.01640976637701461)), ('anies', np.float64(0.014695961789340933))]

Topic 2
[('01', np.float64(0.1029656144038641)), ('dukung', np.float64(0.06065905724951994)), ('

In [68]:
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_topics()

In [69]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [70]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 205
KMeans Topics: 15
HDBSCAN Outliers: 9996


In [71]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,9996,0
0,941,3525
1,903,3463
2,792,3304
3,789,2851
...,...,...
200,16,0
201,16,0
202,15,0
203,15,0
