In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("GPU available:", torch.cuda.is_available())

GPU available: False


In [3]:
df = pd.read_csv("../data/stemmed_merged_kubu_01.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [4]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b|\bya\b', '', regex=True)

In [5]:
docs = df['full_text'].dropna().astype(str).tolist()

In [6]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [7]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [8]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=15)


2025-05-21 21:42:18,177 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 934/934 [05:24<00:00,  2.87it/s]
2025-05-21 21:47:43,587 - BERTopic - Embedding - Completed ✓
2025-05-21 21:47:43,588 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-21 21:48:11,663 - BERTopic - Dimensionality - Completed ✓
2025-05-21 21:48:11,665 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-21 21:49:53,170 - BERTopic - Cluster - Completed ✓
2025-05-21 21:49:53,180 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-21 21:49:53,629 - BERTopic - Representation - Completed ✓
2025-05-21 21:49:54,265 - BERTopic - Topic reduction - Reducing number of topics
2025-05-21 21:49:54,291 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-21 21:49:54,585 - BERTopic - Representation - Completed ✓
2025-05-21 21:49:54,592 - BERTopic - Topic reduction - Reduced 

<bertopic._bertopic.BERTopic at 0x14ad1c54650>

In [9]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                  Name  \
0      -1   8829              -1_01_dukung_anies_pilih   
1       0  10232               0_01_dukung_calon_pilih   
2       1   4236             1_anies_cak_imin_baswedan   
3       2   2329  2_indonesia_presiden_anies_demokrasi   
4       3   1669         3_debat_calon_presiden_kritik   
5       4    844  4_politik_identitas_anies_legislatif   
6       5    554             5_video_videotron_film_01   
7       6    432         6_twitter_media_berita_sosial   
8       7    240               7_program_01_anies_misi   
9       8    152       8_palestina_israel_merdeka_bela   
10      9    138                9_data_fakta_01_dukung   
11     10     74              10_foto_cak_imin_blunder   
12     11     54          11_respons_reply_respon_imin   
13     12     53           12_viral_viralkan_ham_virus   
14     13     25  13_stadion_international_jis_stadium   

                                       Representation  \
0   [01, dukun

In [10]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8829,-1_01_dukung_anies_pilih,"[01, dukung, anies, pilih, calon, orang, imin,...",[anies calon gubernur calon presiden partai ad...
1,0,10232,0_01_dukung_calon_pilih,"[01, dukung, calon, pilih, pasang, orang, mena...",[pasang calon 01 sewa buzzer calon wakil presi...
2,1,4236,1_anies_cak_imin_baswedan,"[anies, cak, imin, baswedan, rasyid, amin, aya...","[tuju cak imin terima kasih cak amin, cak imin..."
3,2,2329,2_indonesia_presiden_anies_demokrasi,"[indonesia, presiden, anies, demokrasi, pilih,...",[februari besok pilih presiden pilih dewan wak...
4,3,1669,3_debat_calon_presiden_kritik,"[debat, calon, presiden, kritik, cak, imin, wa...",[bukti debat calon wakil presiden cak imin huj...


In [11]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('01', np.float64(0.052280113832356735)), ('dukung', np.float64(0.03211180515912758)), ('anies', np.float64(0.030680564167769084)), ('pilih', np.float64(0.028901577038669125)), ('calon', np.float64(0.021469823012414336)), ('orang', np.float64(0.02043306477852408)), ('imin', np.float64(0.020222380892350116)), ('presiden', np.float64(0.01981103158685654)), ('cak', np.float64(0.019704883532166994)), ('pasang', np.float64(0.01816345660737389))]

Topic 0
[('01', np.float64(0.07567067032173785)), ('dukung', np.float64(0.03852017289638799)), ('calon', np.float64(0.030950513921323183)), ('pilih', np.float64(0.029861926119970635)), ('pasang', np.float64(0.023622692987876618)), ('orang', np.float64(0.0235877515435893)), ('menang', np.float64(0.023146195789245823)), ('presiden', np.float64(0.021542752025550508)), ('banget', np.float64(0.018648455450808368)), ('kayak', np.float64(0.018154740768065326))]

Topic 1
[('anies', np.float64(0.07877485178182467)), ('cak', np.float64(0.078297061

In [12]:
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_topics()

In [13]:
# df['created_at'] = pd.to_datetime(df['created_at'])

In [14]:
# topics_over_time = topic_model_hdbscan.topics_over_time(
#     docs=docs,
#     topics=topics_hdbscan,
#     timestamps=df['created_at'],
#     nr_bins=20,  # Adjust to control time granularity
#     evolution_tuning=True,
#     global_tuning=True
# )

In [15]:
# topic_model_hdbscan.visualize_topics_over_time(
#     topics_over_time, 
#     top_n_topics=10  # Number of top topics to show
# )


In [16]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [17]:
from sklearn.cluster import KMeans

num_topics = 15
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=15)

2025-05-21 21:50:01,932 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 934/934 [05:18<00:00,  2.93it/s]
2025-05-21 21:55:21,053 - BERTopic - Embedding - Completed ✓
2025-05-21 21:55:21,054 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-21 21:55:28,170 - BERTopic - Dimensionality - Completed ✓
2025-05-21 21:55:28,172 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-21 21:55:30,773 - BERTopic - Cluster - Completed ✓
2025-05-21 21:55:30,780 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-21 21:55:31,190 - BERTopic - Representation - Completed ✓


In [18]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                            Name  \
0       0   4330        0_01_dukung_pilih_buzzer   
1       1   4165        1_01_dukung_orang_pasang   
2       2   3900       2_cak_imin_anies_baswedan   
3       3   3250       3_anies_imin_cak_baswedan   
4       4   2960        4_01_dukung_pasang_orang   
5       5   2325    5_presiden_anies_pilih_calon   
6       6   1424       6_presiden_calon_wakil_01   
7       7   1146         7_menang_01_survei_gara   
8       8   1120        8_01_twitter_makan_media   
9       9   1073        9_suara_video_program_01   
10     10    970     10_indonesia_agama_01_islam   
11     11    875  11_politik_demokrasi_partai_01   
12     12    857       12_kayak_pintar_01_dukung   
13     13    760      13_debat_calon_01_presiden   
14     14    706      14_debat_cak_imin_presiden   

                                       Representation  \
0   [01, dukung, pilih, buzzer, amin, coblos, bang...   
1   [01, dukung, orang, pasang, pilih, banget, cal...

In [19]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,4330,0_01_dukung_pilih_buzzer,"[01, dukung, pilih, buzzer, amin, coblos, bang...","[lucu dukung 01, dukung 01 lucu banget, anies ..."
1,1,4165,1_01_dukung_orang_pasang,"[01, dukung, orang, pasang, pilih, banget, cal...","[bang pilih kasih alas dukung 01, keren dukung..."
2,2,3900,2_cak_imin_anies_baswedan,"[cak, imin, anies, baswedan, rasyid, ayah, ami...","[belah cak imin, pilih pas umum cak imin ragu ..."
3,3,3250,3_anies_imin_cak_baswedan,"[anies, imin, cak, baswedan, indonesia, amin, ...",[ubah amin menang indonesia senang rakyat adil...
4,4,2960,4_01_dukung_pasang_orang,"[01, dukung, pasang, orang, kampanye, pilih, c...","[kampanye kampanye kerja putus pilih 01, meman..."


In [20]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('01', np.float64(0.16693252876208642)), ('dukung', np.float64(0.05706158075075198)), ('pilih', np.float64(0.039916494444209666)), ('buzzer', np.float64(0.025134449835266703)), ('amin', np.float64(0.02195678841175883)), ('coblos', np.float64(0.02087835088200255)), ('banget', np.float64(0.018447717880339838)), ('bang', np.float64(0.01780265315584883)), ('kubu', np.float64(0.016412442935569087)), ('deh', np.float64(0.015741185545016155))]

Topic 1
[('01', np.float64(0.0895675764875032)), ('dukung', np.float64(0.055370526069649866)), ('orang', np.float64(0.03012416193764948)), ('pasang', np.float64(0.030044352288301563)), ('pilih', np.float64(0.029731186994926016)), ('banget', np.float64(0.02514265282736784)), ('calon', np.float64(0.021973702396372113)), ('buzzer', np.float64(0.021133814930640142)), ('amin', np.float64(0.017015222723623945)), ('bang', np.float64(0.016948261007963077))]

Topic 2
[('cak', np.float64(0.11076406868255162)), ('imin', np.float64(0.1090817193705964)), 

In [21]:
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_topics()

In [22]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [23]:
topics_over_time_kmeans = topic_model_kmeans.topics_over_time(
    docs=docs,
    topics=topics_hdbscan,
    timestamps=df['created_at'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True
)

topic_model_hdbscan.visualize_topics_over_time(topics_over_time_kmeans, top_n_topics=10)

0it [00:00, ?it/s]


IndexError: index (202) out of range

In [None]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 205
KMeans Topics: 15
HDBSCAN Outliers: 9996


In [None]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,9996,0
0,941,3525
1,903,3463
2,792,3304
3,789,2851
...,...,...
200,16,0
201,16,0
202,15,0
203,15,0
