In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [2]:
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [3]:
df = pd.read_csv("../data/stemmed_merged_kubu_02.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [4]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b|\bya\b', '', regex=True)

In [5]:
docs = df['full_text'].dropna().astype(str).tolist()

In [6]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [7]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [8]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=15)


2025-05-22 12:06:15,772 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2951 [00:00<?, ?it/s]

2025-05-22 12:06:31,089 - BERTopic - Embedding - Completed ✓
2025-05-22 12:06:31,090 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 12:06:52,320 - BERTopic - Dimensionality - Completed ✓
2025-05-22 12:06:52,321 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 12:41:28,218 - BERTopic - Cluster - Completed ✓
2025-05-22 12:41:28,231 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 12:41:28,897 - BERTopic - Representation - Completed ✓
2025-05-22 12:41:30,239 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 12:41:30,277 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 12:41:30,612 - BERTopic - Representation - Completed ✓
2025-05-22 12:41:30,617 - BERTopic - Topic reduction - Reduced number of topics from 544 to 15


<bertopic._bertopic.BERTopic at 0x14edd5213f0>

In [9]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                    Name  \
0      -1  32089              -1_prabowo_02_dukung_pilih   
1       0  27601                0_02_dukung_pilih_pasang   
2       1  16775      1_presiden_calon_indonesia_prabowo   
3       2  11446           2_prabowo_gibran_jokowi_orang   
4       3   4861             3_debat_data_calon_presiden   
5       4    533          4_israel_palestina_dukung_bela   
6       5    351           5_viral_dokter_porno_viralkan   
7       6    212  6_ekonomi_tumbuh_prabowo_infrastruktur   
8       7    126            7_online_message_direct_ojek   
9       8    125              8_edit_reformasi_cape_orde   
10      9     84               9_sistem_alat_utama_tahan   
11     10     70       10_stres_teori_prabowo_konspirasi   
12     11     68        11_energi_hijau_gibran_indonesia   
13     12     47              12_tulis_tuli_prabowo_buta   
14     13     22                    13_00am_jalan_tun_02   

                                       

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [10]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,32089,-1_prabowo_02_dukung_pilih,"[prabowo, 02, dukung, pilih, orang, calon, gib...",[prabowo dukung jokowi migrasi prabowo kasihan...
1,0,27601,0_02_dukung_pilih_pasang,"[02, dukung, pilih, pasang, menang, orang, kay...","[orang all in 02 aneh banget otak banget, 02 m..."
2,1,16775,1_presiden_calon_indonesia_prabowo,"[presiden, calon, indonesia, prabowo, gibran, ...",[pasang anies baswedan cak imin amin untung ki...
3,2,11446,2_prabowo_gibran_jokowi_orang,"[prabowo, gibran, jokowi, orang, menang, pinta...",[jokowi pilih prabowo prabowo cocok calon pimp...
4,3,4861,3_debat_data_calon_presiden,"[debat, data, calon, presiden, media, berita, ...",[menang debat anies ganjar menang pilih presid...


In [11]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('prabowo', np.float64(0.048411360836698156)), ('02', np.float64(0.0443228913237586)), ('dukung', np.float64(0.032564888804654656)), ('pilih', np.float64(0.023733999301318916)), ('orang', np.float64(0.0210147958841153)), ('calon', np.float64(0.02097682603843247)), ('gibran', np.float64(0.019343423066841264)), ('pasang', np.float64(0.018882681493277154)), ('banget', np.float64(0.01631506840623672)), ('presiden', np.float64(0.014204891493625556))]

Topic 0
[('02', np.float64(0.07923414359691722)), ('dukung', np.float64(0.03462402711014225)), ('pilih', np.float64(0.026399803860566038)), ('pasang', np.float64(0.023593626184067493)), ('menang', np.float64(0.023311378250516088)), ('orang', np.float64(0.020773619919621098)), ('kayak', np.float64(0.01819755357382016)), ('banget', np.float64(0.01800404514235515)), ('buzzer', np.float64(0.01785336787451907)), ('calon', np.float64(0.0173564457302666))]

Topic 1
[('presiden', np.float64(0.0731264222896784)), ('calon', np.float64(0.04910

In [12]:
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [13]:
# df['created_at'] = pd.to_datetime(df['created_at'])

In [14]:
# topics_over_time = topic_model_hdbscan.topics_over_time(
#     docs=docs,
#     topics=topics_hdbscan,
#     timestamps=df['created_at'],
#     nr_bins=20,  # Adjust to control time granularity
#     evolution_tuning=True,
#     global_tuning=True
# )

In [15]:
# topic_model_hdbscan.visualize_topics_over_time(
#     topics_over_time, 
#     top_n_topics=10  # Number of top topics to show
# )


In [16]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [17]:
from sklearn.cluster import KMeans

num_topics = 15
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=15)

2025-05-22 12:41:35,704 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2951 [00:00<?, ?it/s]

2025-05-22 12:41:51,971 - BERTopic - Embedding - Completed ✓
2025-05-22 12:41:51,972 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 12:42:00,393 - BERTopic - Dimensionality - Completed ✓
2025-05-22 12:42:00,396 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 12:42:00,648 - BERTopic - Cluster - Completed ✓
2025-05-22 12:42:00,660 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 12:42:01,102 - BERTopic - Representation - Completed ✓


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [18]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                                 Name  \
0       0  13797        0_prabowo_gibran_dukung_orang   
1       1  10325        1_prabowo_gibran_menang_orang   
2       2  10032             2_02_orang_dukung_pasang   
3       3   9214             3_02_dukung_pasang_pilih   
4       4   9089             4_02_dukung_pilih_buzzer   
5       5   6783        5_02_kampanye_agama_indonesia   
6       6   6692              6_kayak_02_suara_survei   
7       7   6178     7_gibran_rakabuming_raka_prabowo   
8       8   4486            8_presiden_calon_wakil_02   
9       9   4091             9_debat_02_program_calon   
10     10   3959      10_presiden_prabowo_calon_pilih   
11     11   3697             11_menang_02_teman_panik   
12     12   2157   12_politik_israel_palestina_polisi   
13     13   2055  13_demokrasi_partai_juang_indonesia   
14     14   1855      14_debat_prabowo_presiden_calon   

                                       Representation  \
0   [prabowo, gibran, dukung, 

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [19]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,13797,0_prabowo_gibran_dukung_orang,"[prabowo, gibran, dukung, orang, jokowi, pilih...","[pikir dukung prabowo gibran kasihan coba, duk..."
1,1,10325,1_prabowo_gibran_menang_orang,"[prabowo, gibran, menang, orang, jokowi, anies...",[dukung ganjar amin alih prabowo gibran menang...
2,2,10032,2_02_orang_dukung_pasang,"[02, orang, dukung, pasang, pilih, banget, cal...",[prabowo orang pasang calon 02 mampu personal ...
3,3,9214,3_02_dukung_pasang_pilih,"[02, dukung, pasang, pilih, calon, buzzer, ban...","[prabowo salah duit dukung 02, dukung dukung p..."
4,4,9089,4_02_dukung_pilih_buzzer,"[02, dukung, pilih, buzzer, banget, bang, kala...","[dukung 02 nyebokin banget, 02 pilih takut, sa..."


In [20]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('prabowo', np.float64(0.1537778624996075)), ('gibran', np.float64(0.03637419656866257)), ('dukung', np.float64(0.022999731988196617)), ('orang', np.float64(0.022511897939696122)), ('jokowi', np.float64(0.02157002575293172)), ('pilih', np.float64(0.019958625829038668)), ('ganjar', np.float64(0.01908576024333069)), ('anies', np.float64(0.01744948740431866)), ('mas', np.float64(0.015853876317828444)), ('pimpin', np.float64(0.015451577629413748))]

Topic 1
[('prabowo', np.float64(0.08621333534597744)), ('gibran', np.float64(0.035369486444867086)), ('menang', np.float64(0.02349582176145096)), ('orang', np.float64(0.022783173581615072)), ('jokowi', np.float64(0.022527001943444126)), ('anies', np.float64(0.02073026287562034)), ('ganjar', np.float64(0.020071821097078755)), ('dukung', np.float64(0.018567712473022528)), ('pilih', np.float64(0.018179818587281547)), ('rakyat', np.float64(0.01777177858283392))]

Topic 2
[('02', np.float64(0.06086855409374326)), ('orang', np.float64(0.042

In [21]:
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [23]:
topics_over_time_kmeans = topic_model_kmeans.topics_over_time(
    docs=docs,
    topics=topics_hdbscan,
    timestamps=df['created_at'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True
)

topic_model_hdbscan.visualize_topics_over_time(topics_over_time_kmeans, top_n_topics=10)

0it [00:00, ?it/s]


IndexError: index (540) out of range

In [None]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 205
KMeans Topics: 15
HDBSCAN Outliers: 9996


In [None]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,9996,0
0,941,3525
1,903,3463
2,792,3304
3,789,2851
...,...,...
200,16,0
201,16,0
202,15,0
203,15,0
