In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from wordcloud import WordCloud

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three stemmed/merged CSV exports, one DataFrame per file,
# in the same order as the paths are listed.
df_01, df_02, df_03 = map(
    pd.read_csv,
    (
        'data/stemmed_merged_kubu_01.csv',
        'data/stemmed_merged_kubu_02.csv',
        'data/stemmed_merged_kubu_03.csv',
    ),
)

In [None]:
# Build the document list handed to BERTopic. Missing values (e.g. rows whose
# text was empty in the CSV and was therefore read back as NaN) are replaced by
# empty strings so that every entry is a str, and the list stays aligned
# row-for-row with df_01 (later cells assign the topics back onto df_01).
docs = df_01['full_text'].fillna('').astype(str).tolist()

# Initialize the BERTopic model (multilingual embedding model, for Indonesian text).
# NOTE(review): no random seed is fixed anywhere in this notebook, so topic
# assignments may differ between runs — consider passing a seeded UMAP model.
topic_model = BERTopic(
    language="multilingual",
    calculate_probabilities=True,  # Also return per-topic probabilities for each document
    min_topic_size=5,              # Minimum number of documents per topic
    nr_topics="auto",              # Let BERTopic decide the optimal number
)

# Train the model and get one topic id (plus probabilities) per document
topics, probs = topic_model.fit_transform(docs)

# Get an overview of the topics
topic_info = topic_model.get_topic_info()
print("\nTopic Information:")
print(topic_info.head(10))

In [None]:
# List the leading keywords of every topic except the outlier bucket (-1)
print("\nTop Words per Topic:")
for topic_id, scored_words in topic_model.get_topics().items():
    if topic_id == -1:
        continue
    keywords = (entry[0] for entry in scored_words[:10])
    print(f"Topic {topic_id}: {', '.join(keywords)}")

# Count how many documents were assigned to each topic
topic_series = pd.Series(topics)
topic_distribution = topic_series.value_counts()
print("\nDocument distribution across topics:")
print(topic_distribution.head(10))

# Attach each document's topic, and its highest topic probability, to df_01
df_01['topic'] = topics
df_01['topic_probability'] = [doc_probs.max() for doc_probs in probs]

In [None]:
# Topic word clouds
def plot_topic_wordcloud(topic_model, topic_id, title):
    """Render a word cloud of one topic's keywords, sized by their weights.

    Args:
        topic_model: Fitted topic model exposing ``get_topic(topic_id)``, which
            is expected to return a sequence of ``(word, weight)`` pairs.
        topic_id: Identifier of the topic to draw.
        title: Title shown above the word cloud.

    Returns:
        The matplotlib figure that contains the word cloud.

    Raises:
        ValueError: If the model yields no usable keywords for ``topic_id``.
    """
    # Query the model once. A falsy result means there is nothing to draw
    # (presumably BERTopic returns False for an unknown topic id — TODO confirm),
    # so fail before creating an empty figure.
    topic_words = topic_model.get_topic(topic_id)
    if not topic_words:
        raise ValueError(f"Topic {topic_id} has no keywords to plot")

    # Map keyword -> weight, skipping empty-string keywords
    # (presumably placeholders BERTopic uses to pad short topics — TODO confirm).
    word_freq = {entry[0]: entry[1] for entry in topic_words if entry[0]}
    if not word_freq:
        raise ValueError(f"Topic {topic_id} has no keywords to plot")

    # Generate word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
    ).generate_from_frequencies(word_freq)

    # Draw on an explicit Axes instead of relying on pyplot's "current axes"
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.tight_layout()
    return fig

# Word clouds for topic ids 0, 1 and 2 (ids the model did not produce are skipped)
for topic_id in range(3):
    if topic_id not in topic_model.get_topics():
        continue
    cloud_title = f"Topic {topic_id} Word Cloud"
    fig = plot_topic_wordcloud(topic_model, topic_id, cloud_title)
    plt.show()

# Interactive topic visualization
topic_vis = topic_model.visualize_topics()
topic_vis.show()

# Hierarchical clustering of the topics
hierarchy_vis = topic_model.visualize_hierarchy()
hierarchy_vis.show()

# Topic similarity heatmap
heatmap_vis = topic_model.visualize_heatmap()
heatmap_vis.show()

In [None]:
# Show sample documents for the five most frequent topics (outlier topic -1 excluded)
print("\nRepresentative tweets for top topics:")
top_topics = []
for candidate in topic_distribution.index:
    if candidate == -1:
        continue
    top_topics.append(candidate)
    if len(top_topics) == 5:
        break

for topic_id in top_topics:
    print(f"\n--- Topic {topic_id} ---")
    rep_docs = topic_model.get_representative_docs(topic_id)
    # Print at most three documents, each cut to its first 200 characters
    for rank, doc in enumerate(rep_docs[:3], start=1):
        snippet = doc[:200]
        print(f"{rank}. {snippet}...")