In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [2]:
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [3]:
df = pd.read_csv("../data/stemmed_merged_kubu_01.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [4]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b|\bya\b', '', regex=True)

In [5]:
docs = df['full_text'].dropna().astype(str).tolist()

In [6]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [7]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [8]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=15)


2025-05-22 10:15:19,555 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/934 [00:00<?, ?it/s]

2025-05-22 10:15:24,756 - BERTopic - Embedding - Completed ✓
2025-05-22 10:15:24,756 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 10:15:39,738 - BERTopic - Dimensionality - Completed ✓
2025-05-22 10:15:39,740 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 10:16:34,284 - BERTopic - Cluster - Completed ✓
2025-05-22 10:16:34,289 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 10:16:34,516 - BERTopic - Representation - Completed ✓
2025-05-22 10:16:34,842 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 10:16:34,856 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 10:16:34,982 - BERTopic - Representation - Completed ✓
2025-05-22 10:16:34,984 - BERTopic - Topic reduction - Reduced number of topics from 206 to 15


<bertopic._bertopic.BERTopic at 0x20819f24700>

In [9]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                 Name  \
0      -1   9870              -1_01_anies_dukung_imin   
1       0  10382              0_01_dukung_calon_pilih   
2       1   2979  1_anies_baswedan_indonesia_presiden   
3       2   1680        2_debat_calon_presiden_kritik   
4       3   1544  3_politik_partai_demokrasi_demokrat   
5       4   1514               4_cak_imin_anies_kayak   
6       5    559            5_video_videotron_01_film   
7       6    433        6_twitter_media_berita_sosial   
8       7    358                7_pop_korea_02_konsep   
9       8    233              8_program_anies_01_misi   
10      9    152      9_palestina_israel_merdeka_bela   
11     10     52          10_respons_reply_respon_cak   
12     11     51          11_viral_viralkan_ham_virus   
13     12     33         12_data_luhut_cherry_tambang   
14     13     21             13_estate_food_tani_blak   

                                       Representation  \
0   [01, anies, dukung, imin, 

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [10]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9870,-1_01_anies_dukung_imin,"[01, anies, dukung, imin, cak, pilih, calon, o...",[presiden terima kasih tingkat standar kampany...
1,0,10382,0_01_dukung_calon_pilih,"[01, dukung, calon, pilih, pasang, orang, mena...",[asli haru banget rakyat indonesia jalan jis d...
2,1,2979,1_anies_baswedan_indonesia_presiden,"[anies, baswedan, indonesia, presiden, rasyid,...",[semangat indonesia milik pimpin rakyat anies ...
3,2,1680,2_debat_calon_presiden_kritik,"[debat, calon, presiden, kritik, cak, imin, an...",[bukti debat calon wakil presiden cak imin huj...
4,3,1544,3_politik_partai_demokrasi_demokrat,"[politik, partai, demokrasi, demokrat, anies, ...",[ketua partai pegang partai demokrasi indonesi...


In [11]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('01', np.float64(0.044657594563075206)), ('anies', np.float64(0.03835662477102073)), ('dukung', np.float64(0.030574327089240915)), ('imin', np.float64(0.03004313859230322)), ('cak', np.float64(0.02951789471657971)), ('pilih', np.float64(0.02753467247489603)), ('calon', np.float64(0.019261171367582337)), ('orang', np.float64(0.01917101292149584)), ('amin', np.float64(0.018896133234297143)), ('presiden', np.float64(0.018584025074659095))]

Topic 0
[('01', np.float64(0.07526216246939593)), ('dukung', np.float64(0.03685685200245356)), ('calon', np.float64(0.03114196267161134)), ('pilih', np.float64(0.030023127656557364)), ('pasang', np.float64(0.023760180007082594)), ('orang', np.float64(0.023725092252510106)), ('menang', np.float64(0.023707664677412584)), ('presiden', np.float64(0.0216531265469141)), ('kayak', np.float64(0.01854468703828625)), ('banget', np.float64(0.017949223351798608))]

Topic 1
[('anies', np.float64(0.07736698961806611)), ('baswedan', np.float64(0.051277017

In [12]:
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [13]:
# df['created_at'] = pd.to_datetime(df['created_at'])

In [14]:
# topics_over_time = topic_model_hdbscan.topics_over_time(
#     docs=docs,
#     topics=topics_hdbscan,
#     timestamps=df['created_at'],
#     nr_bins=20,  # Adjust to control time granularity
#     evolution_tuning=True,
#     global_tuning=True
# )

In [15]:
# topic_model_hdbscan.visualize_topics_over_time(
#     topics_over_time, 
#     top_n_topics=10  # Number of top topics to show
# )


In [16]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [13]:
from sklearn.cluster import KMeans

num_topics = 15
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=15)

2025-05-22 12:04:33,346 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/934 [00:00<?, ?it/s]

2025-05-22 12:04:38,509 - BERTopic - Embedding - Completed ✓
2025-05-22 12:04:38,510 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 12:04:40,784 - BERTopic - Dimensionality - Completed ✓
2025-05-22 12:04:40,786 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 12:04:40,939 - BERTopic - Cluster - Completed ✓
2025-05-22 12:04:40,945 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 12:04:41,114 - BERTopic - Representation - Completed ✓


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [14]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                            Name  \
0       0   3249         0_01_dukung_orang_pilih   
1       1   3151        1_01_dukung_pilih_pasang   
2       2   2856       2_anies_baswedan_imin_cak   
3       3   2752          3_01_dukung_pilih_amin   
4       4   2696      4_01_dukung_kampanye_orang   
5       5   2321    5_presiden_anies_pilih_calon   
6       6   2203      6_anies_indonesia_imin_cak   
7       7   2084         7_cak_imin_anies_banget   
8       8   1801         8_kayak_01_video_dukung   
9       9   1650     9_suara_indonesia_01_pintar   
10     10   1555      10_presiden_calon_01_wakil   
11     11   1199        11_menang_01_survei_gara   
12     12    885  12_politik_demokrasi_partai_01   
13     13    748      13_debat_calon_01_presiden   
14     14    711      14_debat_cak_imin_presiden   

                                       Representation  \
0   [01, dukung, orang, pilih, pasang, banget, tem...   
1   [01, dukung, pilih, pasang, buzzer, banget, ca...

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [15]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,3249,0_01_dukung_orang_pilih,"[01, dukung, orang, pilih, pasang, banget, tem...","[orang malu pilih 01, orang pilih 01 dukung am..."
1,1,3151,1_01_dukung_pilih_pasang,"[01, dukung, pilih, pasang, buzzer, banget, ca...","[sopan banget dukung amin dukung 01, kasihan p..."
2,2,2856,2_anies_baswedan_imin_cak,"[anies, baswedan, imin, cak, rasyid, ayah, ami...",[tim nasional timnas menang anies baswedan muh...
3,3,2752,3_01_dukung_pilih_amin,"[01, dukung, pilih, amin, bang, buzzer, coblos...","[ dukung 01 harap, asli dukung 01 amin, dukung..."
4,4,2696,4_01_dukung_kampanye_orang,"[01, dukung, kampanye, orang, pasang, pilih, k...",[ribut pasang calon a b c omong bicara data co...


In [16]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('01', np.float64(0.0771922877941425)), ('dukung', np.float64(0.05080941147309609)), ('orang', np.float64(0.037948476104567454)), ('pilih', np.float64(0.029205198271782782)), ('pasang', np.float64(0.02891636204398661)), ('banget', np.float64(0.024384181851401388)), ('teman', np.float64(0.02398189712478682)), ('calon', np.float64(0.021505741597132855)), ('buzzer', np.float64(0.01612408369899807)), ('bilang', np.float64(0.01574043530658533))]

Topic 1
[('01', np.float64(0.11554277495623003)), ('dukung', np.float64(0.06138137985566899)), ('pilih', np.float64(0.035872245111182206)), ('pasang', np.float64(0.030602561745866348)), ('buzzer', np.float64(0.027421154467304733)), ('banget', np.float64(0.023402408122674034)), ('calon', np.float64(0.02219665300763565)), ('bang', np.float64(0.019595020859839805)), ('amin', np.float64(0.01850916308741227)), ('kubu', np.float64(0.017474347201589323))]

Topic 2
[('anies', np.float64(0.07949788701704243)), ('baswedan', np.float64(0.05648703278

In [17]:
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [18]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [19]:
topics_over_time_kmeans = topic_model_kmeans.topics_over_time(
    docs=docs,
    topics=topics_hdbscan,
    timestamps=df['created_at'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True
)

topic_model_hdbscan.visualize_topics_over_time(topics_over_time_kmeans, top_n_topics=10)

0it [00:00, ?it/s]


IndexError: index (184) out of range

In [20]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 205
KMeans Topics: 15
HDBSCAN Outliers: 9870


In [21]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,9870,0
0,1015,3249
1,776,3151
2,568,2856
3,545,2752
...,...,...
200,16,0
201,16,0
202,16,0
203,15,0
