In [17]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [18]:
print("GPU available:", torch.cuda.is_available())

GPU available: False


In [19]:
df = pd.read_csv("../data/stemmed_merged_kubu_01.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [20]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b', '', regex=True)

In [21]:
docs = df['full_text'].dropna().astype(str).tolist()

In [22]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [23]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [28]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=30)


2025-05-20 18:45:42,901 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 934/934 [05:56<00:00,  2.62it/s]
2025-05-20 18:51:39,219 - BERTopic - Embedding - Completed ✓
2025-05-20 18:51:39,220 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-20 18:51:46,793 - BERTopic - Dimensionality - Completed ✓
2025-05-20 18:51:46,796 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-20 18:53:28,179 - BERTopic - Cluster - Completed ✓
2025-05-20 18:53:28,189 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-20 18:53:28,774 - BERTopic - Representation - Completed ✓
2025-05-20 18:53:29,563 - BERTopic - Topic reduction - Reducing number of topics
2025-05-20 18:53:29,593 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-20 18:53:29,938 - BERTopic - Representation - Completed ✓
2025-05-20 18:53:29,943 - BERTopic - Topic reduction - Reduced number of topics from 201 to 30


<bertopic._bertopic.BERTopic at 0x19d26e9e510>

In [29]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=10)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                            Name  \
0      -1   8850                        -1_01_dukung_anies_pilih   
1       0   7897                         0_01_dukung_orang_pilih   
2       1   4350                       1_cak_imin_anies_baswedan   
3       2   2041                2_presiden_anies_indonesia_pilih   
4       3   1383                   3_presiden_calon_indonesia_01   
5       4   1348             4_politik_demokrasi_partai_demokrat   
6       5   1302                    5_debat_presiden_calon_wakil   
7       6    567                    6_video_videotron_film_lihat   
8       7    314                  7_kritik_komentar_opini_dukung   
9       8    305                      8_twitter_berita_01_tiktok   
10      9    231                       9_program_misi_visi_kerja   
11     10    211                       10_pintar_orang_dukung_01   
12     11    149                11_palestina_israel_merdeka_aksi   
13     12    141                         12_kaya

In [30]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8850,-1_01_dukung_anies_pilih,"[01, dukung, anies, pilih, pasang, calon, amin...",[orang pikir waras pilih pasang calon 01 anies...
1,0,7897,0_01_dukung_orang_pilih,"[01, dukung, orang, pilih, kayak, menang, pasa...","[pasang calon 01 awal suara kayak, orang cerda..."
2,1,4350,1_cak_imin_anies_baswedan,"[cak, imin, anies, baswedan, rasyid, amin, aya...",[tutup kampanye jis cak imin menang sambut pel...
3,2,2041,2_presiden_anies_indonesia_pilih,"[presiden, anies, indonesia, pilih, baswedan, ...",[pilih presiden selesai presiden anies rasyid ...
4,3,1383,3_presiden_calon_indonesia_01,"[presiden, calon, indonesia, 01, wakil, pilih,...","[pilih pasang calon 01 calon presiden doang , ..."


In [31]:
for tid in topic_info['Topic'][:10]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('01', np.float64(0.033769738953936826)), ('dukung', np.float64(0.023431180737116898)), ('anies', np.float64(0.017041645121952026)), ('pilih', np.float64(0.016082903248091775)), ('pasang', np.float64(0.014483684313922142)), ('calon', np.float64(0.013865910187683232)), ('amin', np.float64(0.013206987635296596)), ('imin', np.float64(0.013178019222805513)), ('orang', np.float64(0.012978840935215657)), ('banget', np.float64(0.012872053105648712))]

Topic 0
[('01', np.float64(0.0490007585834405)), ('dukung', np.float64(0.026941556660253677)), ('orang', np.float64(0.020828194056245843)), ('pilih', np.float64(0.02056598326767903)), ('kayak', np.float64(0.020138182561019864)), ('menang', np.float64(0.017681476923224172)), ('pasang', np.float64(0.017467365055288835)), ('suara', np.float64(0.016226352416607345)), ('banget', np.float64(0.01395728994742787)), ('calon', np.float64(0.012562618820492123))]

Topic 1
[('cak', np.float64(0.05253600116134086)), ('imin', np.float64(0.0521380819

In [32]:
topic_model_hdbscan.visualize_barchart(top_n_topics=30)
topic_model_hdbscan.visualize_topics()

In [33]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [34]:
from sklearn.cluster import KMeans

num_topics = 30
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=30)

2025-05-20 18:53:30,648 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 934/934 [05:39<00:00,  2.75it/s]
2025-05-20 18:59:09,936 - BERTopic - Embedding - Completed ✓
2025-05-20 18:59:09,937 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-20 18:59:16,860 - BERTopic - Dimensionality - Completed ✓
2025-05-20 18:59:16,863 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-20 18:59:16,963 - BERTopic - Cluster - Completed ✓
2025-05-20 18:59:16,970 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-20 18:59:17,405 - BERTopic - Representation - Completed ✓


In [35]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=10)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                                  Name  \
0       0   2415              0_01_dukung_pilih_buzzer   
1       1   2115          1_anies_baswedan_rasyid_imin   
2       2   2107             2_01_dukung_pasang_buzzer   
3       3   1970             3_anies_baswedan_imin_cak   
4       4   1934               4_01_dukung_orang_teman   
5       5   1735               5_01_orang_dukung_pilih   
6       6   1689          6_presiden_pilih_anies_calon   
7       7   1592                 7_01_line_dukung_grup   
8       8   1551               8_cak_imin_anies_banget   
9       9   1192               9_menang_survei_01_gara   
10     10   1023             10_kampanye_kerja_01_data   
11     11    798         11_presiden_calon_wakil_pilih   
12     12    794        12_twitter_media_kritik_sosial   
13     13    767               13_cak_imin_kayak_anies   
14     14    761            14_debat_calon_01_strategi   
15     15    749    15_indonesia_anies_baswedan_makmur   
16     16    7

In [36]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,2415,0_01_dukung_pilih_buzzer,"[01, dukung, pilih, buzzer, coblos, kubu, amin...","[alhamdulillah banget dukung 01 serang, bawa ..."
1,1,2115,1_anies_baswedan_rasyid_imin,"[anies, baswedan, rasyid, imin, cak, ayah, ami...","[lupa dukung ayah anies rasyid baswedan mana, ..."
2,2,2107,2_01_dukung_pasang_buzzer,"[01, dukung, pasang, buzzer, banget, bang, sal...","[dukung 01 cari salah, hasil tarik dukung pasa..."
3,3,1970,3_anies_baswedan_imin_cak,"[anies, baswedan, imin, cak, rasyid, amin, kam...",[anies rasyid baswedan berani mantap banget an...
4,4,1934,4_01_dukung_orang_teman,"[01, dukung, orang, teman, pasang, pilih, calo...",[muak dukung 01 aneh dukung pasang calon kena ...


In [37]:
for tid in topic_info['Topic'][:10]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('01', np.float64(0.08859477740870374)), ('dukung', np.float64(0.040829760668721835)), ('pilih', np.float64(0.039840589590172525)), ('buzzer', np.float64(0.026065499717887713)), ('coblos', np.float64(0.02250656713954992)), ('kubu', np.float64(0.021475226436417617)), ('amin', np.float64(0.01868922014666655)), ('banget', np.float64(0.017590316775865533)), ('bang', np.float64(0.016965965028355484)), ('all', np.float64(0.015148760090962332))]

Topic 1
[('anies', np.float64(0.06333527283920057)), ('baswedan', np.float64(0.05549028503222567)), ('rasyid', np.float64(0.0448971324756731)), ('imin', np.float64(0.04304652759100552)), ('cak', np.float64(0.04246926226151545)), ('ayah', np.float64(0.028705829862894824)), ('amin', np.float64(0.017879642902794878)), ('sambut', np.float64(0.014069924788707344)), ('muhaimin', np.float64(0.013505356311437932)), ('anak', np.float64(0.01347297779697965))]

Topic 2
[('01', np.float64(0.06128277482908749)), ('dukung', np.float64(0.04255688806894271

In [38]:
topic_model_kmeans.visualize_barchart(top_n_topics=30)
topic_model_kmeans.visualize_topics()

In [39]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [40]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 200
KMeans Topics: 30
HDBSCAN Outliers: 8850


In [41]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,8850,0
0,3175,2415
1,1335,2115
2,922,2107
3,561,1970
...,...,...
195,16,0
196,16,0
197,16,0
198,15,0


In [42]:
topic_model_hdbscan.get_topic(3)
df[topics_hdbscan == 3].sample(3) 

topic_model_kmeans.get_topic(3)
df[topics_kmeans == 3].sample(3)


KeyError: False