In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [2]:
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [3]:
df = pd.read_csv("../data/stemmed_merged_kubu_03.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [4]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b|\bya\b', '', regex=True)

In [5]:
docs = df['full_text'].dropna().astype(str).tolist()

In [6]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [7]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [8]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=15)


2025-05-22 14:49:03,397 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1183 [00:00<?, ?it/s]

2025-05-22 14:49:09,438 - BERTopic - Embedding - Completed ✓
2025-05-22 14:49:09,439 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 14:49:26,322 - BERTopic - Dimensionality - Completed ✓
2025-05-22 14:49:26,323 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 14:51:21,344 - BERTopic - Cluster - Completed ✓
2025-05-22 14:51:21,349 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 14:51:21,627 - BERTopic - Representation - Completed ✓
2025-05-22 14:51:22,092 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 14:51:22,112 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 14:51:22,242 - BERTopic - Representation - Completed ✓
2025-05-22 14:51:22,245 - BERTopic - Topic reduction - Reduced number of topics from 284 to 15


<bertopic._bertopic.BERTopic at 0x20846f406a0>

In [9]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                      Name  \
0      -1  11348                -1_ganjar_03_mahfud_dukung   
1       0  15611          0_ganjar_mahfud_indonesia_dukung   
2       1   4483         1_presiden_calon_demokrasi_partai   
3       2   3182                   2_03_kayak_suara_pasang   
4       3    958  3_program_koruptor_nusakambangan_penjara   
5       4    589              4_debat_presiden_calon_wakil   
6       5    469         5_internet_gratis_digital_program   
7       6    444            6_ekonomi_investasi_mahfud_set   
8       7    228                 7_porno_video_tonton_suka   
9       8    169       8_strategi_strategis_terobos_ganjar   
10      9    147                9_mikro_tengah_usaha_lokal   
11     10     74            10_kartu_duduk_tanda_elektonik   
12     11     50              11_produk_lokal_keren_ganjar   
13     12     49                    12_data_omon_bantah_03   
14     13     34                       13_all_in_ganjar_sp   

       

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [10]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11348,-1_ganjar_03_mahfud_dukung,"[ganjar, 03, mahfud, dukung, rakyat, pilih, pi...","[tuju banget ganjar calon pimpin butuh rakyat,..."
1,0,15611,0_ganjar_mahfud_indonesia_dukung,"[ganjar, mahfud, indonesia, dukung, rakyat, pi...","[ganjar bukti rakyat cinta dukung ganjar, seja..."
2,1,4483,1_presiden_calon_demokrasi_partai,"[presiden, calon, demokrasi, partai, pilih, po...",[calon wakil presiden pilih ganjar mahfud md b...
3,2,3182,2_03_kayak_suara_pasang,"[03, kayak, suara, pasang, menang, dukung, cal...","[ganjar banget 03 menang putar, kali 03 mena..."
4,3,958,3_program_koruptor_nusakambangan_penjara,"[program, koruptor, nusakambangan, penjara, ga...",[ganjar dukung program ganjar mahfud indonesia...


In [11]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('ganjar', np.float64(0.05542516904901977)), ('03', np.float64(0.04586229441262075)), ('mahfud', np.float64(0.02990034932404309)), ('dukung', np.float64(0.028830228745348723)), ('rakyat', np.float64(0.024296669838062553)), ('pilih', np.float64(0.021372794463426697)), ('pimpin', np.float64(0.0199076514151934)), ('banget', np.float64(0.018651229278200544)), ('masyarakat', np.float64(0.01570585572500672)), ('pranowo', np.float64(0.015296283277488954))]

Topic 0
[('ganjar', np.float64(0.06765518635499278)), ('mahfud', np.float64(0.04253791512899599)), ('indonesia', np.float64(0.02855524636175078)), ('dukung', np.float64(0.02814791364014002)), ('rakyat', np.float64(0.026110202772534268)), ('pimpin', np.float64(0.023851783353937204)), ('menang', np.float64(0.02363999726589821)), ('pranowo', np.float64(0.0210354601787112)), ('orang', np.float64(0.020435741303809374)), ('masyarakat', np.float64(0.018991144447769465))]

Topic 1
[('presiden', np.float64(0.10822111480379051)), ('calon'

In [12]:
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [13]:
# df['created_at'] = pd.to_datetime(df['created_at'])

In [14]:
# topics_over_time = topic_model_hdbscan.topics_over_time(
#     docs=docs,
#     topics=topics_hdbscan,
#     timestamps=df['created_at'],
#     nr_bins=20,  # Adjust to control time granularity
#     evolution_tuning=True,
#     global_tuning=True
# )

In [15]:
# topic_model_hdbscan.visualize_topics_over_time(
#     topics_over_time, 
#     top_n_topics=10  # Number of top topics to show
# )


In [16]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [17]:
from sklearn.cluster import KMeans

num_topics = 15
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=15)

2025-05-22 14:51:27,348 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1183 [00:00<?, ?it/s]

2025-05-22 14:51:34,208 - BERTopic - Embedding - Completed ✓
2025-05-22 14:51:34,209 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 14:51:37,027 - BERTopic - Dimensionality - Completed ✓
2025-05-22 14:51:37,029 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 14:51:37,197 - BERTopic - Cluster - Completed ✓
2025-05-22 14:51:37,205 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 14:51:37,397 - BERTopic - Representation - Completed ✓


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [18]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                                       Name  \
0       0   6637              0_ganjar_dukung_rakyat_mahfud   
1       1   6593                   1_03_pasang_calon_dukung   
2       2   4794               2_ganjar_orang_mahfud_dukung   
3       3   2467               3_ganjar_kampanye_prof_kayak   
4       4   2344                 4_ganjar_mahfud_muda_jalan   
5       5   2250              5_presiden_calon_pilih_ganjar   
6       6   2172                  6_menang_bal_ganjar_cinta   
7       7   2070             7_politik_debat_presiden_calon   
8       8   2008           8_indonesia_ganjar_mahfud_pimpin   
9       9   1622           9_program_gratis_internet_ganjar   
10     10   1376               10_kerja_fokus_ganjar_mahfud   
11     11   1254           11_positif_ekonomi_mahfud_ganjar   
12     12   1171        12_demokrasi_partai_juang_indonesia   
13     13    826            13_suara_palestina_israel_porno   
14     14    251  14_koruptor_nusakambangan_penjara_kor

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [19]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,6637,0_ganjar_dukung_rakyat_mahfud,"[ganjar, dukung, rakyat, mahfud, pimpin, bange...",[ganjar mantap bangga banget ganjar moga pimpi...
1,1,6593,1_03_pasang_calon_dukung,"[03, pasang, calon, dukung, pilih, menang, sua...","[ganjar pilih bikin sejahtera 03, terima pilih..."
2,2,4794,2_ganjar_orang_mahfud_dukung,"[ganjar, orang, mahfud, dukung, rakyat, pimpin...","[ganjar anak ganjar orang tulus, ganjar ganjar..."
3,3,2467,3_ganjar_kampanye_prof_kayak,"[ganjar, kampanye, prof, kayak, mahfud, survei...",[momen puncak kampanye ganjar pranowo prof mah...
4,4,2344,4_ganjar_mahfud_muda_jalan,"[ganjar, mahfud, muda, jalan, hukum, negeri, a...",[ganjar mahfud pilih rakyat jawa barat inspira...


In [20]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('ganjar', np.float64(0.10563637831023052)), ('dukung', np.float64(0.039253406693153955)), ('rakyat', np.float64(0.03797676614604729)), ('mahfud', np.float64(0.03634624353112381)), ('pimpin', np.float64(0.029184304032929667)), ('banget', np.float64(0.028227703703532742)), ('masyarakat', np.float64(0.025886835132074383)), ('pilih', np.float64(0.02554368724689994)), ('moga', np.float64(0.02199660942658145)), ('mantap', np.float64(0.019620198379063956))]

Topic 1
[('03', np.float64(0.15683701830767569)), ('pasang', np.float64(0.03871601333957629)), ('calon', np.float64(0.029484046682026074)), ('dukung', np.float64(0.029402543356155556)), ('pilih', np.float64(0.027603276812517376)), ('menang', np.float64(0.025454319227896312)), ('suara', np.float64(0.02076056016178918)), ('coblos', np.float64(0.020232962010484634)), ('banget', np.float64(0.01687086317780825)), ('sejahtera', np.float64(0.015940294276725556))]

Topic 2
[('ganjar', np.float64(0.06285569407370847)), ('orang', np.floa

In [21]:
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [23]:
topics_over_time_kmeans = topic_model_kmeans.topics_over_time(
    docs=docs,
    topics=topics_hdbscan,
    timestamps=df['created_at'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True
)

topic_model_hdbscan.visualize_topics_over_time(topics_over_time_kmeans, top_n_topics=10)

0it [00:00, ?it/s]


IndexError: index (279) out of range

In [None]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 205
KMeans Topics: 15
HDBSCAN Outliers: 9996


In [None]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,9996,0
0,941,3525
1,903,3463
2,792,3304
3,789,2851
...,...,...
200,16,0
201,16,0
202,15,0
203,15,0
