In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("GPU available:", torch.cuda.is_available())

GPU available: False


In [3]:
df = pd.read_csv("../data/stemmed_merged_kubu_03.csv") 
print(df.columns)

Index(['created_at', 'id_str', 'full_text', 'lang', 'location'], dtype='object')


In [4]:
df['full_text'] = df['full_text'].str.replace(r'(wk)+|\bsih\b|\bya\b', '', regex=True)

In [5]:
docs = df['full_text'].dropna().astype(str).tolist()

In [6]:
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [7]:
pca_model = PCA(n_components=25)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [8]:
topic_model_hdbscan = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,
    verbose=True
)

topics_hdbscan, probs = topic_model_hdbscan.fit_transform(docs)

topic_model_hdbscan.reduce_topics(docs, nr_topics=15)


2025-05-21 23:29:54,689 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 1183/1183 [06:23<00:00,  3.08it/s]
2025-05-21 23:36:18,999 - BERTopic - Embedding - Completed ✓
2025-05-21 23:36:19,000 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-21 23:36:48,969 - BERTopic - Dimensionality - Completed ✓
2025-05-21 23:36:48,971 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-21 23:40:15,273 - BERTopic - Cluster - Completed ✓
2025-05-21 23:40:15,286 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-21 23:40:15,885 - BERTopic - Representation - Completed ✓
2025-05-21 23:40:16,913 - BERTopic - Topic reduction - Reducing number of topics
2025-05-21 23:40:16,952 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-21 23:40:17,229 - BERTopic - Representation - Completed ✓
2025-05-21 23:40:17,229 - BERTopic - Topic reduction - Reduced number of topics from 279 to 15


<bertopic._bertopic.BERTopic at 0x2305a30be00>

In [9]:
# Topic summary table
topic_info = topic_model_hdbscan.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_hdbscan.visualize_topics()
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_heatmap()


    Topic  Count                                      Name  \
0      -1  10771                -1_ganjar_03_mahfud_dukung   
1       0  15368          0_ganjar_mahfud_dukung_indonesia   
2       1   4600                   1_03_calon_kayak_pasang   
3       2   3452            2_presiden_program_calon_pilih   
4       3   1605          3_demokrasi_partai_juang_politik   
5       4    720             4_debat_calon_presiden_kritik   
6       5    339           5_ekonomi_mahfud_digital_tumbuh   
7       6    234                 6_porno_video_tonton_suka   
8       7    168       7_strategi_strategis_terobos_ganjar   
9       8    162                8_berita_media_sosial_baca   
10      9    148                9_mikro_tengah_usaha_lokal   
11     10     94  10_infrastruktur_reformasi_bangun_mahfud   
12     11     81            11_kartu_duduk_tanda_elektonik   
13     12     48              12_produk_lokal_keren_ganjar   
14     13     45                 13_data_omon_fakta_bantah   

       

In [10]:
topic_info = topic_model_hdbscan.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10771,-1_ganjar_03_mahfud_dukung,"[ganjar, 03, mahfud, dukung, rakyat, pilih, pi...",[ganjar mahfud pimpin peduli rakyat beliau but...
1,0,15368,0_ganjar_mahfud_dukung_indonesia,"[ganjar, mahfud, dukung, indonesia, rakyat, pi...",[ganjar semangat bangun indonesia ganjar prano...
2,1,4600,1_03_calon_kayak_pasang,"[03, calon, kayak, pasang, menang, presiden, p...",[lihat masyarakat kampanye putar pasang calon ...
3,2,3452,2_presiden_program_calon_pilih,"[presiden, program, calon, pilih, ganjar, grat...",[ganjar dukung mantap ganjar mahfud pilih pres...
4,3,1605,3_demokrasi_partai_juang_politik,"[demokrasi, partai, juang, politik, indonesia,...",[keluarga jokowi tinggal partai demokrasi indo...


In [11]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_hdbscan.get_topic(tid)[:15])  # top-15 words



Topic -1
[('ganjar', np.float64(0.05652350243096272)), ('03', np.float64(0.043262546079148985)), ('mahfud', np.float64(0.027998730435209413)), ('dukung', np.float64(0.027836977424174594)), ('rakyat', np.float64(0.024898435693233134)), ('pilih', np.float64(0.0219970167831874)), ('pimpin', np.float64(0.020395864919268647)), ('banget', np.float64(0.01950344588555811)), ('masyarakat', np.float64(0.01606574386561848)), ('indonesia', np.float64(0.015224138469174307))]

Topic 0
[('ganjar', np.float64(0.06847348166283955)), ('mahfud', np.float64(0.04339154903346606)), ('dukung', np.float64(0.02852798716683141)), ('indonesia', np.float64(0.026145666455745835)), ('rakyat', np.float64(0.02579178714921937)), ('pimpin', np.float64(0.024357290543456738)), ('menang', np.float64(0.02225396487880074)), ('orang', np.float64(0.02081044715717578)), ('pranowo', np.float64(0.020026716804087498)), ('masyarakat', np.float64(0.01902181268175712))]

Topic 1
[('03', np.float64(0.1290682381423728)), ('calon', np

In [12]:
topic_model_hdbscan.visualize_barchart(top_n_topics=15)
topic_model_hdbscan.visualize_topics()

In [13]:
# df['created_at'] = pd.to_datetime(df['created_at'])

In [14]:
# topics_over_time = topic_model_hdbscan.topics_over_time(
#     docs=docs,
#     topics=topics_hdbscan,
#     timestamps=df['created_at'],
#     nr_bins=20,  # Adjust to control time granularity
#     evolution_tuning=True,
#     global_tuning=True
# )

In [15]:
# topic_model_hdbscan.visualize_topics_over_time(
#     topics_over_time, 
#     top_n_topics=10  # Number of top topics to show
# )


In [16]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_hdbscan})
df_topics.to_csv("topic_assignments_hdbscan.csv", index=False)

# Save model
# topic_model.save("bertopic_model_hdbscan")


In [17]:
from sklearn.cluster import KMeans

num_topics = 15
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)

topic_model_kmeans = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics_kmeans, _ = topic_model_kmeans.fit_transform(docs)

topic_model_kmeans.visualize_barchart(top_n_topics=15)

2025-05-21 23:40:24,190 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1183/1183 [06:19<00:00,  3.12it/s]
2025-05-21 23:46:44,177 - BERTopic - Embedding - Completed ✓
2025-05-21 23:46:44,178 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-21 23:46:53,326 - BERTopic - Dimensionality - Completed ✓
2025-05-21 23:46:53,329 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-21 23:46:55,315 - BERTopic - Cluster - Completed ✓
2025-05-21 23:46:55,326 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-21 23:46:55,718 - BERTopic - Representation - Completed ✓


In [18]:
# Topic summary table
topic_info = topic_model_kmeans.get_topic_info()
print(topic_info)

# Interactive visualizations
topic_model_kmeans.visualize_topics()
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_heatmap()


    Topic  Count                                       Name  \
0       0   7390              0_ganjar_dukung_mahfud_rakyat   
1       1   6753               1_ganjar_menang_orang_mahfud   
2       2   3807                   2_03_pilih_pasang_dukung   
3       3   2779                    3_03_pasang_suara_calon   
4       4   2667             4_ganjar_mahfud_kerja_kampanye   
5       5   2251              5_presiden_calon_pilih_ganjar   
6       6   2011              6_positif_mahfud_ekonomi_prof   
7       7   2003           7_indonesia_ganjar_mahfud_pimpin   
8       8   1887                8_kayak_ganjar_survei_rumah   
9       9   1662           9_program_gratis_internet_ganjar   
10     10   1236                 10_debat_presiden_calon_03   
11     11   1174        11_demokrasi_partai_juang_indonesia   
12     12   1138            12_suara_agama_palestina_israel   
13     13    825            13_politik_polisi_ganjar_mahfud   
14     14    252  14_koruptor_nusakambangan_penjara_kor

In [19]:
topic_info = topic_model_kmeans.get_topic_info()
display(topic_info.head())        

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,7390,0_ganjar_dukung_mahfud_rakyat,"[ganjar, dukung, mahfud, rakyat, pimpin, bange...",[ganjar baik ganjar pilih rakyat ganjar pranow...
1,1,6753,1_ganjar_menang_orang_mahfud,"[ganjar, menang, orang, mahfud, dukung, anak, ...",[ganjar mahfud menang orang milik alam hebat p...
2,2,3807,2_03_pilih_pasang_dukung,"[03, pilih, pasang, dukung, calon, coblos, men...","[no 03 menang, percaya 03, dukung 03 banget ]"
3,3,2779,3_03_pasang_suara_calon,"[03, pasang, suara, calon, menang, dukung, sej...",[semangat masyarakat energi menang pasang calo...
4,4,2667,4_ganjar_mahfud_kerja_kampanye,"[ganjar, mahfud, kerja, kampanye, jalan, hukum...",[dukung penuh rakyat jawa barat ganjar mahfud ...


In [20]:
for tid in topic_info['Topic'][:15]:   # first 10 topics
    print(f"\nTopic {tid}")
    print(topic_model_kmeans.get_topic(tid)[:15])  # top-15 words



Topic 0
[('ganjar', np.float64(0.10188270265106636)), ('dukung', np.float64(0.04058778577884403)), ('mahfud', np.float64(0.03776063621588905)), ('rakyat', np.float64(0.037079257542527215)), ('pimpin', np.float64(0.029443217793787196)), ('banget', np.float64(0.02723951667765711)), ('masyarakat', np.float64(0.025477968643717916)), ('pilih', np.float64(0.02492776571291783)), ('moga', np.float64(0.021029463617865764)), ('semangat', np.float64(0.018481215987589606))]

Topic 1
[('ganjar', np.float64(0.06806972721198605)), ('menang', np.float64(0.03648038200840639)), ('orang', np.float64(0.035643537659914674)), ('mahfud', np.float64(0.03381059868859819)), ('dukung', np.float64(0.02704393159365117)), ('anak', np.float64(0.025625435469087524)), ('bal', np.float64(0.025098862542817884)), ('rakyat', np.float64(0.023440659117470244)), ('pimpin', np.float64(0.02212536915999303)), ('masyarakat', np.float64(0.020762407489783738))]

Topic 2
[('03', np.float64(0.24363197038931647)), ('pilih', np.float

In [21]:
topic_model_kmeans.visualize_barchart(top_n_topics=15)
topic_model_kmeans.visualize_topics()

In [22]:
# Save results to CSV
df_topics = pd.DataFrame({"Document": docs, "Topic": topics_kmeans})
df_topics.to_csv("topic_assignments_kmeans.csv", index=False)

# Save model
# topic_model.save("bertopic_model_kmeans")

In [23]:
topics_over_time_kmeans = topic_model_kmeans.topics_over_time(
    docs=docs,
    topics=topics_hdbscan,
    timestamps=df['created_at'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True
)

topic_model_hdbscan.visualize_topics_over_time(topics_over_time_kmeans, top_n_topics=10)

0it [00:00, ?it/s]


IndexError: index (276) out of range

In [None]:
print("HDBSCAN Topics:", len(set(topics_hdbscan)) - (1 if -1 in topics_hdbscan else 0))
print("KMeans Topics:", len(set(topics_kmeans)))

# Outlier analysis
print("HDBSCAN Outliers:", topics_hdbscan.count(-1))  # KMeans won't have outliers

HDBSCAN Topics: 205
KMeans Topics: 15
HDBSCAN Outliers: 9996


In [None]:
import pandas as pd
from collections import Counter

# Count documents per topic
hdbscan_counts = Counter(topics_hdbscan)
kmeans_counts = Counter(topics_kmeans)

# Create DataFrame to compare
comparison_df = pd.DataFrame({
    "HDBSCAN": pd.Series(hdbscan_counts),
    "KMeans": pd.Series(kmeans_counts)
}).fillna(0).astype(int)

comparison_df.sort_index()


Unnamed: 0,HDBSCAN,KMeans
-1,9996,0
0,941,3525
1,903,3463
2,792,3304
3,789,2851
...,...,...
200,16,0
201,16,0
202,15,0
203,15,0
