### 🚀 Task 1: Load Dataset

In [5]:
import pandas as pd
import nltk
nltk.download("stopwords")

# Load the Biden tweets dataset
dataset_path = "biden.csv"
df = pd.read_csv(dataset_path)

# Display the dataset
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,created_at,tweet
0,2020-11-03 23:41:02,#ElectionNight #MSNBC2020 #IVoted #Biden2020 #...
1,2020-11-04 09:10:42,Go Headlines: #TopNews Of The Hour\n#USElectio...
2,2020-10-18 13:25:15,I doubt the person(s) who stole our official B...
3,2020-10-21 09:30:52,The Bidens are safe so long as Fox News is the...
4,2020-11-07 17:58:18,Since I live in a republican state TIME FOR ME...


### 🚀 Task 2: Preprocessing the Text

In [6]:
import spacy
import re
from nltk.corpus import stopwords

# Load Spacy Model
nlp = spacy.load("en_core_web_sm")

# Define stopwords
stop_words = set(stopwords.words("english"))

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation & special characters
    doc = nlp(text)  # Tokenize & Lemmatize
    words = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]
    return " ".join(words)

# Apply preprocessing
df["processed_text"] = df["tweet"].astype(str).apply(preprocess_text)

# Display cleaned data
df[["tweet", "processed_text"]]

KeyboardInterrupt: 

### 🚀 Task 3: Train a Normal LDA with K=30

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Convert text into a document-term matrix (DTM)
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(df["processed_text"])

# Train LDA model
lda = LatentDirichletAllocation(n_components=30, max_iter=10, random_state=42)
lda.fit(X)

# Display topics
words = vectorizer.get_feature_names_out()
topics = {}
for topic_idx, topic in enumerate(lda.components_):
    top_words = [words[i] for i in topic.argsort()[:-11:-1]]
    topics[f"Topic {topic_idx+1}"] = top_words

# Show topics
import ace_tools as tools
tools.display_dataframe_to_user(name="LDA Topics", dataframe=pd.DataFrame(topics))

print("✅ LDA model trained with K=30 topics!")


### 🚀 Task 4: Train a RollingLDA

In [None]:
from ttta.methods.rolling_lda import RollingLDA
import numpy as np

# Convert timestamps to datetime
df["created_at"] = pd.to_datetime(df["created_at"])

# Define time chunks (3-day intervals)
df["time_chunk"] = df["created_at"].dt.floor("3D")

# Prepare text data for RollingLDA
texts = df.groupby("time_chunk")["processed_text"].apply(lambda x: " ".join(x)).tolist()

# Convert text into document-term matrix
vectorizer = CountVectorizer(max_features=10000)
X_chunks = vectorizer.fit_transform(texts)

# Train RollingLDA
rolling_lda = RollingLDA(n_topics=30, prototype=1, n_iter=100)  # Reduce epochs if slow
rolling_lda.fit(X_chunks)

# Display topics
words = vectorizer.get_feature_names_out()
rolling_topics = {}
for topic_idx, topic in enumerate(rolling_lda.components_):
    top_words = [words[i] for i in topic.argsort()[:-11:-1]]
    rolling_topics[f"Topic {topic_idx+1}"] = top_words

# Show RollingLDA topics
tools.display_dataframe_to_user(name="RollingLDA Topics", dataframe=pd.DataFrame(rolling_topics))

print("✅ RollingLDA trained successfully with K=30 topics and 3-day time chunks!")


### 🚀 Task 5: Compare Evolution of Topics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to extract topic distributions over time
def get_topic_distribution(rolling_lda, X_chunks):
    topic_distributions = rolling_lda.transform(X_chunks)
    return topic_distributions

# Extract topic distributions
topic_distributions = get_topic_distribution(rolling_lda, X_chunks)

# Convert to DataFrame
df_topic_evolution = pd.DataFrame(topic_distributions, columns=[f"Topic {i+1}" for i in range(30)])

# Add time chunk labels
df_topic_evolution["Time Chunk"] = df["time_chunk"].unique()

# Plot topic evolution
plt.figure(figsize=(12, 6))
sns.heatmap(df_topic_evolution.set_index("Time Chunk").T, cmap="Blues", linewidths=0.5)
plt.title("Topic Evolution Over Time (RollingLDA)")
plt.xlabel("Time Chunk")
plt.ylabel("Topics")
plt.show()

print("✅ Topic evolution comparison completed!")
