In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

In [None]:
# Read the CSV dataset, parse `created_at` into datetimes, and derive a
# calendar-date column (`date`) that later cells use to split the period.
df_02 = (
    pd.read_csv('data/stemmed_merged_kubu_02.csv')
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
    .assign(date=lambda frame: frame['created_at'].dt.date)
)

In [None]:
# Stopword list for the presidential-election dataset. It is handed to the
# CountVectorizer so these terms never appear in topic representations.
# Every entry is listed exactly once ("via" previously appeared in two groups).
indo_stopwords = [
    # Basic Indonesian function words and common chat abbreviations
    "yang", "dan", "di", "dengan", "untuk", "dari", "ke", "pada", "ini", "itu",
    "atau", "ada", "juga", "dalam", "saya", "kamu", "anda", "mereka", "kita", "kami",
    "dia", "nya", "adalah", "akan", "oleh", "seperti", "telah", "sudah", "bisa", "dapat",
    "tidak", "tak", "jangan", "ya", "via", "yg", "dgn", "utk", "dr", "pd", "dlm", "sdh", "tdk",

    # Common tweet/social media terms
    "rt", "dm", "pm", "cc", "wkwk", "haha", "woi", "hey", "ah", "eh", "oh", "hmm",
    "uwu", "btw", "omg", "lol", "min", "admin", "hehe", "wkwkwk", "awokawok", "hahaha",

    # Locations
    "indonesia", "jakarta", "jawa", "barat", "timur", "tengah", "sumatra", "bali", "aceh", "selatan",
    "utara", "kota", "desa", "daerah", "wilayah", "provinsi", "kabupaten", "negara", "republik",

    # Honorifics, kinship terms and forms of address
    "mas", "pak", "bu", "bapak", "ibu", "om", "tante", "mbak", "bro", "sis", "bang", "abang",
    "saudara", "laki", "wanita", "kak", "adek", "adik", "kakak", "anak", "bung",

    # Candidate names (present in nearly every document, so not discriminative)
    "prabowo", "jokowi", "gibran", "subianto", "rakabuming", "anies", "ganjar", "raka",
]

In [None]:

# --- Pipeline components, in the order the run log reports them being used ---

# 1. Sentence embeddings (multilingual model, used here for Indonesian text).
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 2. Dimensionality reduction ahead of clustering; seeded so reruns are repeatable.
umap_settings = dict(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
umap_model = UMAP(**umap_settings)

# 3. Density-based clustering of the reduced embeddings.
# prediction_data=True keeps what HDBSCAN needs to assign unseen documents later.
hdbscan_settings = dict(
    min_cluster_size=10,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_settings)

# 4. Bag-of-words over uni-, bi- and tri-grams with the custom stopword list.
# Terms must occur in at least 5 documents and in at most 70% of them.
vectorizer = CountVectorizer(
    stop_words=indo_stopwords,
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.7,
)

# 5. c-TF-IDF weighting with BM25 weighting and frequent-word reduction enabled.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)

# --- Assemble BERTopic ---
# nr_topics=15 merges the raw clusters down to 15 topics after clustering
# (the recorded run log shows "Reduced number of topics from 910 to 15").
# NOTE(review): an explicit hdbscan_model and embedding_model are supplied, so
# `min_topic_size` and `language` are presumably not used by BERTopic here -
# TODO confirm against the BERTopic documentation.
# NOTE(review): the recorded log shows the clustering stage running from 19:26
# to 22:02; calculate_probabilities=True may be the cause, and `probs` is not
# used in the visible cells - TODO confirm before disabling it.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    ctfidf_model=ctfidf_model,
    language="multilingual",
    nr_topics=15,
    min_topic_size=10,
    calculate_probabilities=True,
    verbose=True,
)

# Document timestamps, kept in the same order as the documents for the
# topics-over-time analysis in a later cell.
timestamps = df_02['created_at'].tolist()

# Fit the model and get one topic id (plus probabilities) per document.
topics, probs = topic_model.fit_transform(df_02['full_text'].tolist())

2025-05-22 19:25:04,222 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2951 [00:00<?, ?it/s]

2025-05-22 19:25:19,207 - BERTopic - Embedding - Completed ✓
2025-05-22 19:25:19,208 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 19:26:16,727 - BERTopic - Dimensionality - Completed ✓
2025-05-22 19:26:16,728 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 22:02:41,757 - BERTopic - Cluster - Completed ✓
2025-05-22 22:02:41,758 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-22 22:02:43,611 - BERTopic - Representation - Completed ✓
2025-05-22 22:02:43,611 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 22:02:43,661 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 22:02:45,006 - BERTopic - Representation - Completed ✓
2025-05-22 22:02:45,011 - BERTopic - Topic reduction - Reduced number of topics from 910 to 15


In [None]:
# Track how each topic's frequency and top words change across time bins.
topics_over_time = topic_model.topics_over_time(
    docs=df_02['full_text'].tolist(),
    timestamps=timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=14  # Roughly 1 bin per day for a two-week analysis
)

# Ordered (label, keywords) rules: the FIRST rule with a matching keyword wins,
# so more specific themes must be listed before broader ones.
THEME_RULES = [
    ("Gaya Kampanye & Debat", {"debat", "pidato", "joget", "acara"}),
    ("Kontroversi MK & Etika", {"konstitusi", "mk", "etik", "putus", "hakim", "anwar", "usman"}),
    ("Survei & Elektabilitas", {"elektabilitas", "survei", "unggul", "menang", "putar"}),
    ("Dukungan Partai Politik", {"partai", "bangkit", "demokrasi", "politik"}),
    # "pemilihan" was previously misspelled "pemelihan" and could never match.
    ("Proses Electoral & KPU", {"nomor", "urut", "daftar", "komisi", "pemilihan"}),
    # "politik" is intentionally absent here: the party rule above already
    # claims it, so it was unreachable in this rule.
    ("Polemik Dinasti Politik", {"dinasti", "keluarga", "jokowi"}),
    ("Isu Palestina", {"palestina", "doa", "israel", "gaza"}),
    ("Visi Ekonomi & Kesejahteraan", {"ekonomi", "makmur", "sejahtera", "kerja"}),
    ("Netralitas Aparat & Kampanye", {"aparat", "netral", "pilih", "baliho"}),
    ("Integritas Pemilu", {"curang", "bersih", "adil", "jujur"}),
]


def _theme_label(words):
    """Return the theme label for a topic given its top representation terms.

    Args:
        words: The topic's top terms; each may be a multi-word n-gram.

    Returns:
        The label of the first rule in THEME_RULES sharing a token with
        `words`, or the first three terms joined by " & " when no rule matches.
    """
    # Terms can be n-grams (the vectorizer uses ngram_range=(1, 3)), so match
    # on individual tokens rather than on whole terms.
    tokens = {token for word in words for token in word.split()}
    for label, keywords in THEME_RULES:
        if tokens & keywords:
            return label
    return " & ".join(words[:3])


# Build one custom label per topic that appears in the over-time table.
topic_labels = {}
for topic_id in sorted(topics_over_time["Topic"].unique()):
    topic_id = int(topic_id)
    if topic_id == -1:
        topic_labels[topic_id] = "Outlier"
        continue
    top_words = [word for word, _ in topic_model.get_topic(topic_id)[:5]]
    topic_labels[topic_id] = _theme_label(top_words)

# Register the custom labels on the model.
topic_model.set_topic_labels(topic_labels)

# Attach the custom labels to the over-time table. Mapping through the dict
# (instead of get_topic_info()["Name"], which holds the model's default label)
# makes the downstream charts show the theme names, and avoids rebuilding the
# topic-info table for every row.
# Topics sharing a theme label are summed together by the later pivot tables.
topics_over_time["Name"] = topics_over_time["Topic"].map(topic_labels)

In [None]:
from wordcloud import WordCloud
import seaborn as sns
# import mdates
import matplotlib.dates as mdates

In [None]:
# Line chart: frequency of the most common topics over time.

# Build the topic-info table once and reuse it.
topic_info = topic_model.get_topic_info()

# Take the first 8 rows of the topic-info table and drop the outlier topic (-1).
top_topics = [topic for topic in topic_info["Topic"][:8] if topic != -1]
topic_names = [topic_info.loc[topic_info["Topic"] == topic, "Name"].values[0]
               for topic in top_topics]

# Keep only the rows of the over-time table that belong to the top topics.
filtered_tot = topics_over_time[topics_over_time["Topic"].isin(top_topics)]

# One row per time bin, one column per topic name.
pivot_data = pd.pivot_table(
    data=filtered_tot,
    index="Timestamp",
    columns="Name",
    values="Frequency",
    aggfunc="sum"
).fillna(0)

# Draw on an explicit Axes: calling plt.figure() and then DataFrame.plot(figsize=...)
# created a second figure and left the first one empty.
# x_compat=True makes pandas use matplotlib's date units so that the
# mdates.DateFormatter below renders the correct dates.
fig, ax = plt.subplots(figsize=(20, 10))
pivot_data.plot(kind='line', marker='o', markersize=8, linewidth=2.5, ax=ax, x_compat=True)

# Titles, labels and legend.
ax.set_title("Evolusi Topik Pembicaraan Masyarakat Terkait Calon Prabowo-Gibran (November 2023)",
             fontsize=18, fontweight='bold')
ax.set_xlabel("Tanggal", fontsize=14)
ax.set_ylabel("Frekuensi Topik", fontsize=14)
ax.tick_params(axis='x', labelsize=12, labelrotation=45)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title="Tema Kampanye", fontsize=12, title_fontsize=14)
ax.grid(True, linestyle='--', alpha=0.7)

# Format date axis to be more readable
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))

# Campaign milestones to annotate when they fall inside the plotted range.
key_events = {
    "2023-11-25": "Akhir Masa Pencalonan Presiden",
    "2023-11-28": "Awal Masa Kampanye",
    "2023-12-12": "Debat Pertama (Capres)",
    "2023-12-22": "Debat Kedua (Cawapres)",
    "2024-01-07": "Debat Ketiga (Capres)",
    "2024-01-21": "Debat Keempat (Cawapres)",
    "2024-02-04": "Debat Kelima (Capres)",
    "2024-02-10": "Akhir Masa Kampanye",
    "2024-02-14": "Pemungutan Suara"
}

# If the time index is timezone-aware, localize the event dates the same way;
# comparing naive and aware timestamps raises a TypeError.
index_tz = getattr(pivot_data.index, "tz", None)
range_start, range_end = pivot_data.index.min(), pivot_data.index.max()

for date_str, event_name in key_events.items():
    event_date = pd.to_datetime(date_str)
    if index_tz is not None:
        event_date = event_date.tz_localize(index_tz)
    if range_start <= event_date <= range_end:
        ax.axvline(x=event_date, color='red', linestyle='--', alpha=0.7)
        ax.text(event_date, ax.get_ylim()[1] * 0.95, event_name, rotation=90,
                verticalalignment='top', fontsize=12, fontweight='bold')

# Lay out after all annotations have been added.
fig.tight_layout()
plt.show()

# Stacked area chart: composition of all non-outlier topics over time.

# One row per time bin, one column per topic name; the outlier topic (-1) is excluded.
pivot_data_all = pd.pivot_table(
    data=topics_over_time[topics_over_time["Topic"] != -1],
    index="Timestamp",
    columns="Name",
    values="Frequency",
    aggfunc="sum"
).fillna(0)

# Draw on an explicit Axes: calling plt.figure() and then .plot.area(figsize=...)
# created a second figure and left the first one empty.
fig, ax = plt.subplots(figsize=(20, 10))
pivot_data_all.plot.area(ax=ax, alpha=0.7, linewidth=0, stacked=True, colormap='viridis')

# Titles, labels and legend (legend placed outside the plotting area).
ax.set_title("Komposisi Topik Pembicaraan Masyarakat Terkait Calon Prabowo-Gibran dari Waktu ke Waktu",
             fontsize=18, fontweight='bold')
ax.set_xlabel("Tanggal", fontsize=14)
ax.set_ylabel("Proporsi Tema", fontsize=14)
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', labelsize=12, labelrotation=45)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title="Tema Kampanye", fontsize=10, title_fontsize=12,
          bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of topic frequency per calendar date (outlier topic -1 excluded).
fig, ax = plt.subplots(figsize=(20, 12))

# Rows are topic names, columns are the calendar dates of the time bins.
non_outlier_rows = topics_over_time[topics_over_time["Topic"] != -1]
bin_dates = pd.to_datetime(non_outlier_rows["Timestamp"]).dt.date
heatmap_data = pd.pivot_table(
    data=non_outlier_rows,
    index="Name",
    columns=bin_dates,
    values="Frequency",
    aggfunc="sum",
).fillna(0)

# Order rows so the topic with the highest mean frequency comes first.
row_order = heatmap_data.mean(axis=1).sort_values(ascending=False).index
heatmap_data = heatmap_data.loc[row_order]

# Draw the heatmap onto the prepared axes.
sns.heatmap(
    heatmap_data,
    cmap="viridis",
    annot=False,
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={'label': 'Frekuensi'},
    ax=ax,
)

# Titles and tick styling.
ax.set_title("Intensitas Topik Pembicaraan Masyarakat Terkait Calon Prabowo-Gibran dari Waktu ke Waktu",
             fontsize=18, fontweight='bold')
ax.set_xlabel("Tanggal", fontsize=14)
ax.set_ylabel("Tema Kampanye", fontsize=14)
plt.setp(ax.get_xticklabels(), fontsize=10, rotation=45)
plt.setp(ax.get_yticklabels(), fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Compare topic shares between the early and late halves of the period.

# Split at the median calendar date: dates before it are "early", the rest "late".
unique_dates = sorted(df_02['date'].unique())
mid_point = unique_dates[len(unique_dates) // 2]
early_period = df_02[df_02['date'] < mid_point]
late_period = df_02[df_02['date'] >= mid_point]

# Assign each document in each period to a topic with the fitted model.
early_topics, _ = topic_model.transform(early_period['full_text'].tolist())
late_topics, _ = topic_model.transform(late_period['full_text'].tolist())

# Share of each topic among the non-outlier documents of a period
# (value_counts sorts the result from most to least frequent).
early_freq = pd.Series([t for t in early_topics if t != -1]).value_counts(normalize=True)
late_freq = pd.Series([t for t in late_topics if t != -1]).value_counts(normalize=True)

# Build the topic-id -> name lookup once instead of rebuilding the topic-info
# table for every lookup.
# NOTE(review): "Name" is presumably the model's default label; labels set via
# set_topic_labels may live in a separate column - TODO confirm which is wanted.
topic_name_by_id = topic_model.get_topic_info().set_index("Topic")["Name"].to_dict()

# Every topic that appears in either period, with its display name.
# A distinct variable name is used so the `topic_labels` dict built by the
# labelling cell is not overwritten.
all_topics = sorted(set(early_freq.index) | set(late_freq.index))
comparison_labels = [topic_name_by_id[t] for t in all_topics]

# Percentages per period, one row per topic.
comparison_df = pd.DataFrame({
    'Awal Kampanye': [early_freq.get(t, 0) * 100 for t in all_topics],
    'Akhir Kampanye': [late_freq.get(t, 0) * 100 for t in all_topics]
}, index=comparison_labels)

# Sort by early-period share, using late-period share to break ties.
comparison_df = comparison_df.sort_values(by=['Awal Kampanye', 'Akhir Kampanye'], ascending=False)

# Horizontal bar chart on an explicit Axes: calling plt.figure() and then
# .plot.barh(figsize=...) created a second figure and left the first one empty.
fig, ax = plt.subplots(figsize=(15, 10))
comparison_df.plot.barh(ax=ax)
ax.set_title('Perbandingan Frekuensi Topik: Awal vs. Akhir Kampanye', fontsize=16, fontweight='bold')
ax.set_xlabel('Persentase (%)', fontsize=12)
ax.set_ylabel('Tema Kampanye', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.legend(fontsize=12)
fig.tight_layout()
plt.show()


def _print_top_topics(header, period, freq, limit=5):
    """Print the `limit` most frequent topics of one period with their share in %."""
    print(f"{header} ({period['date'].min()} hingga {period['date'].max()}):")
    # Series.items() returns an iterator and cannot be sliced, so take the
    # head of the (already descending) series before iterating.
    for topic, pct in freq.head(limit).items():
        print(f"- {topic_name_by_id[topic]}: {pct*100:.1f}%")


# Print summary findings
print("\n=== RINGKASAN ANALISIS TOPIK DARI WAKTU KE WAKTU ===\n")
_print_top_topics("Periode Awal", early_period, early_freq)
_print_top_topics("\nPeriode Akhir", late_period, late_freq)

In [None]:
# Top-5 topics per calendar month.

# Add a monthly period column to the dataframe.
df_02['month'] = df_02['created_at'].dt.to_period('M')

# Months present in the data, in chronological order.
months = sorted(df_02['month'].unique())

# Build the topic-id -> name lookup once instead of rebuilding the topic-info
# table for every lookup.
# NOTE(review): "Name" is presumably the model's default label; labels set via
# set_topic_labels may live in a separate column - TODO confirm which is wanted.
topic_name_by_id = topic_model.get_topic_info().set_index("Topic")["Name"].to_dict()

# month -> list of (topic name, percentage) for that month's top 5 topics.
monthly_topics = {}

for month in months:
    # Assign this month's documents to topics with the fitted model.
    month_docs = df_02.loc[df_02['month'] == month, 'full_text'].tolist()
    month_topics, _ = topic_model.transform(month_docs)

    # Count documents per topic, ignoring the outlier topic (-1).
    topic_freq = pd.Series([t for t in month_topics if t != -1]).value_counts()

    # Percentages are relative to ALL documents of the month (outliers are
    # still part of the denominator), so they need not sum to 100.
    monthly_topics[month] = [
        (topic_name_by_id[topic_id], count / len(month_topics) * 100)
        for topic_id, count in topic_freq.head(5).items()
    ]

# One horizontal bar chart per month, stacked vertically.
# squeeze=False keeps `axes` two-dimensional even when there is one month.
fig, axes = plt.subplots(len(months), 1, figsize=(15, 8 * len(months)), squeeze=False)

# Loop variables are named so they do not overwrite `topics` (the
# fit_transform result) or `topic_labels` (the custom-label dict).
for ax, (month, month_top) in zip(axes[:, 0], monthly_topics.items()):
    bar_labels = [name for name, _ in month_top]
    bar_values = [share for _, share in month_top]

    ax.barh(bar_labels, bar_values)
    ax.set_title(f'Top 5 Topics for {month}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Frequency (%)')
    ax.grid(axis='x', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

# Print detailed results
print("\n=== ANALISIS TOPIK PEMBICARAAN MASYARAKAT TERKAIT CALON PRABOWO-GIBRAN ===\n")
for month, month_top in monthly_topics.items():
    print(f"\nBulan: {month}")
    print("-" * 50)
    for rank, (topic_name, share) in enumerate(month_top, 1):
        print(f"{rank}. {topic_name}: {share:.1f}%")