# **Setup**

## **Install Modules**

In [None]:
!pip install -q kaggle
!pip install -q csvkit
!pip install -q pyLDAvis
!pip install -q vaderSentiment
!pip install -q fuzzywuzzy

## **Upload Kaggle API credentials and download the dataset**

In [None]:
from google.colab import files
files.upload()

In [5]:
! mkdir ~/.kaggle

In [None]:
! cp kaggle.json ~/.kaggle/

In [7]:
! chmod 600 ~/.kaggle/kaggle.json

chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
! kaggle datasets download -d asaniczka/public-opinion-on-republicans-daily-updated

In [None]:
! mkdir republicans
! unzip /content/public-opinion-on-republicans-daily-updated.zip -d /content/republicans

## **Import modules**

In [None]:
# Mount google drive directory
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Standard library imports
import os
import pickle
import re
from datetime import datetime

# Related third party imports
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import spacy
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from fuzzywuzzy import fuzz

# nltk specific downloads
nltk.data.path.append('/content/')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# nltk specific imports
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Local application/library specific imports
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)

In [None]:
# Root folder for everything this notebook writes; '' means the current working directory
PATH = ''
DATASETS_PATH = PATH + 'datasets/'
MODELS = PATH + 'models/'

# Create the output folders up front so the CSV save points and pickled models
# written further down do not fail on a fresh runtime.
os.makedirs(DATASETS_PATH, exist_ok=True)
os.makedirs(MODELS, exist_ok=True)

## **Load Dataset**

In [None]:
# Load dataset
df = pd.read_csv('/content/republicans/reddit_opinion_republican.csv')

# Display the first few rows of the dataset
print(df.head())
print(df.info())
print(df.describe())

In [14]:
# Print the total number of rows
print("Total number of rows:", df.shape[0])

Total number of rows: 5019642


# **Data Exploration**

### **Check missing values**

In [15]:
df.isna().sum()

Unnamed: 0,0
comment_id,0
score,0
self_text,33
subreddit,0
created_time,0
post_id,0
author_name,0
controversiality,0
ups,0
downs,0


### **Check duplicates**

In [16]:
df.duplicated().sum()

0

### **Create a dataframe that takes data up until November 5th, 2024 00:00:00**

In [17]:
print(df.shape[0])

# Convert 'created_time' column to datetime objects
df['created_time'] = pd.to_datetime(df['created_time'])

# Filter data
pre_elections_df = df[df['created_time'] <= '2024-11-04 23:59:59']

# Print the total number of rows in the filtered DataFrame
print("Total number of rows in the filtered DataFrame:", pre_elections_df.shape[0])

# Print first few entries of filtered dataframe
pre_elections_df.head()

5019642
Total number of rows in the filtered DataFrame: 4366187


Unnamed: 0,comment_id,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
653455,lvfs88y,1,Guy doesn’t know what real work is. Spoiled little bully,PoliticalHumor,2024-11-04 23:59:59,1gjmqhm,PerpetualFarter,0,1,0,...,1912.0,18548.0,21359.0,1028,,"Trump's Garbage truck door dilemma: three tries, one handle, zero dignity",0.98,1028,0,2024-11-04 19:22:41
653456,lvfs7mf,71,Anyone who’s “undecided” is voting Trump and not admitting it because psycho leftists will ruin their lives,ShitPoliticsSays,2024-11-04 23:59:54,1gjs4nv,burntbridges20,0,71,0,...,4169.0,52691.0,58238.0,275,,"""Woman I've been dating said she's undecided about Trump v. Harris. I feel sick."" Entire thread is a gold mine, literally 100%.",0.95,275,0,2024-11-04 23:07:25
653457,lvfs7f7,4,My great aunt was one of the last generations before the polio vaccine was available. She got it as a kid and was confined to a wheelchair for the rest of her life. Fuck these conservative ghouls.,politics,2024-11-04 23:59:52,1gjlocw,henrythe13th,0,4,0,...,5.0,32044.0,32057.0,210,,Trump indicates he is open to RFK Jr’s proposal to ban vaccines if elected,0.91,210,0,2024-11-04 18:39:35
653458,lvfs78b,13,"People don't remember. No one was safe in the third Reich, it wasn't just Jewish people. \n\nIf you were gay, concentration camp. \n\nIf you were a woman, off to the Aryan breeding programs. \n\nIf you were a male nazi with brown eyes, welcome to the Eastern front. \n\nIf you were a Hitler youth, go defend Berlin with a slingshot. \n\nIf you were an Aryan Uber mensch, congratulations, you get PTSD from atrocities Hitler commands you to commit while he figures out how to label you a traitor. \n\nIf youre Hitler's top general, welcome to the impossible campaign. We expect you to beat all of Russia with three teenagers and a luger. \n\nIf youre Hitlers wife, please eat this cyanide to prove your devotion to hitler. \n\nIf youre hitler, you will be shot by hitler himself!\n\nIn fascism, everybody suffers. Everyone. There is no one better off than they were before it started. Believing anything else is tantamount to political blindness.",politics,2024-11-04 23:59:50,1gjmmjo,MountainMan2_,0,13,0,...,6430.0,147382.0,154418.0,15657,,Harris leading Trump by 34 points among Latino voters in Pennsylvania: Survey,0.96,15657,0,2024-11-04 19:18:11
653459,lvfs739,70,This needs to be the law of the land. All 50 states.,Conservative,2024-11-04 23:59:48,1gjrp4i,Dazzling_Pink9751,0,70,0,...,1.0,9058.0,9059.0,661,,BREAKING: Georgia Supreme Court gives BIG victory to Republicans on election day voting,0.72,661,0,2024-11-04 22:48:22


In [None]:
csv_file = DATASETS_PATH + "republicans-basic.csv"

pre_elections_df.to_csv(csv_file, index=False)

### **Get the count of unique subreddits**

In [None]:
unique_subreddit = pre_elections_df['subreddit'].unique()
print('Unique Subreddits: ', unique_subreddit)

subreddit_counts = pre_elections_df['subreddit'].value_counts()
print('Count of Subreddits: \n', subreddit_counts)

# Create a bar plot
plt.figure(figsize=(12, 6))
subreddit_counts.plot(kind='bar')

# Customize the plot
plt.title('Subreddit Counts')
plt.xlabel('Subreddit')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Show the plot
plt.show()

### **Subreddit count in hundreds**

In [None]:
subreddit_counts_in_hundreds = subreddit_counts / 100

# Create a bar plot
plt.figure(figsize=(12, 6))
subreddit_counts_in_hundreds[:10].plot(kind='bar', color='skyblue')

# Customize the plot
plt.title('Top 10 Comment Counts (in Hundreds) Based on Subreddits')
plt.xlabel('Subreddit')
plt.ylabel('Count (hundreds)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Show the plot
plt.show()

### **Create a dataframe that only has data from the 'politics' subreddit up until November 5th, 2024**

### Filter rows sourced from "r/politics" subreddit

In [None]:
politics_df = pre_elections_df[pre_elections_df['subreddit'] == 'politics']
politics_df.head()

### **Create a dataframe that only has data from the 'politics' subreddit from 2024**

In [None]:
# .copy() makes this an independent frame: later cells add columns to it and
# drop rows in place, which on a filtered slice raises SettingWithCopyWarning
pre_elections_2024_df = politics_df[politics_df['created_time'] >= '2024-01-01 00:00:00'].copy()

In [None]:
# Group by month and count the comments
pre_elections_2024_df['created_month'] = pre_elections_2024_df['created_time'].dt.to_period('M')
print(pre_elections_2024_df["created_month"])

monthly_comment_count = pre_elections_2024_df.groupby('created_month').size().reset_index(name='comment_count')
print(monthly_comment_count)

print(monthly_comment_count.info())

# Convert 'created_month' to datetime for plotting
monthly_comment_count['created_month'] = monthly_comment_count['created_month'].dt.to_timestamp()

# Plot the number of comments per month
plt.figure(figsize=(12, 6))
plt.plot(monthly_comment_count['created_month'], monthly_comment_count['comment_count'], \
         label='Number of Comments', marker='o', color='skyblue', linestyle='-')
plt.xlabel("Month")
plt.ylabel("Number of Comments")
plt.title("Monthly Comment Count for Republican-Related Comments")
plt.show()

In [None]:
# Find the day of highest engagement in 2024-08
monthly_rep_df = pre_elections_2024_df[pre_elections_2024_df['created_month'] == "2024-08"]
print(monthly_rep_df["created_time"].tail())
print(monthly_rep_df["created_time"].head())

### **Most Engaging Day of The Most Engaging Month**

In [None]:
# Group by day and count the comments
daily_comment_count = monthly_rep_df.groupby(monthly_rep_df['created_time'].dt.day).size().reset_index(name='comment_count')

# Rename the 'created_time' column to 'day'
daily_comment_count = daily_comment_count.rename(columns={'created_time': 'day'})

# Create a bar plot of daily counts
plt.figure(figsize=(12, 6))
plt.bar(daily_comment_count['day'], daily_comment_count['comment_count'], color='blue')

# Customize the plot
# Access the first value of the 'created_time' column using .iloc[0]
first_date = monthly_rep_df['created_time'].iloc[0]
plt.title(f'Daily Comment Counts for {first_date.month}-{first_date.year}')
plt.xlabel('Day of the Month')
plt.ylabel('Number of Comments')
plt.xticks(daily_comment_count['day'])  # Ensure all days are shown on the x-axis
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
max_comment_day = daily_comment_count.loc[daily_comment_count['comment_count'].idxmax(), 'day']
print("Day with maximum comment count:", max_comment_day)

# Filter the data for the specific day with the maximum comments
max_day_data = monthly_rep_df[monthly_rep_df['created_time'].dt.day == max_comment_day]

# Now max_day_data contains only the comments from the day with the highest comment count
# within the month that had the highest overall comment count.
print(f"\nComments from the day with the maximum comments in {first_date.month}:")
max_day_data.head()

In [None]:
# Group by 'post_title' and sum the 'score' for each unique title within max_day_data
title_scores = max_day_data.groupby('post_title')['score'].sum()

# Sort the title scores in descending order
title_scores_sorted = title_scores.sort_values(ascending=False)

# Create a new DataFrame with the sorted titles and the sum of their scores
result_df = pd.DataFrame({
    'post_title': title_scores_sorted.index,
    'total_score': title_scores_sorted.values
})

# Add the post_id of the first comment seen for each title. One grouped lookup
# replaces re-filtering max_day_data once per title.
first_post_id_by_title = max_day_data.groupby('post_title')['post_id'].first()
result_df['post_id'] = result_df['post_title'].map(first_post_id_by_title)

result_df

In [None]:
# Sort the DataFrame by total_score in descending order and get the top 10 rows
top_10_titles = result_df.sort_values(by='total_score', ascending=False).head(10)

# Print the top 10 titles and their total scores
pd.set_option('display.max_colwidth', None)
#this will ensure that the entire title is displayed
print(top_10_titles['post_title'])

In [None]:
def trim_string(string, max_length=29):
    """Shorten ``string`` for use as a plot tick label.

    Args:
        string: Text to shorten.
        max_length: Number of leading characters to keep before the ellipsis.

    Returns:
        ``string`` unchanged when it already fits in ``max_length`` characters,
        otherwise its first ``max_length`` characters followed by ``'...'``.
    """
    # Only mark the text as truncated when something was actually cut off
    if len(string) <= max_length:
        return string
    return string[:max_length] + '...'

top_10_titles['post_title_shortened'] = top_10_titles['post_title'].apply(trim_string)

plt.figure(figsize=(12, 6))
plt.bar(top_10_titles['post_title_shortened'], top_10_titles['total_score'])
plt.xlabel("Post Title")
plt.ylabel("Title Score")
plt.title("Top 10 Titles: Title Score vs. Post")
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Show the three highest-scoring post titles in full (fewer if result_df is shorter)
for full_title in result_df['post_title'].head(3):
    print(full_title)

# **Sentiment Exploration on non-cleaned data**

### **Handle Missing values**

In [None]:
# Convert the 'user_account_created_time' column to datetime objects
pre_elections_2024_df['user_account_created_time'] = pd.to_datetime(pre_elections_2024_df['user_account_created_time'])

# Calculate the median account age and use it to fill missing values
median_date = pre_elections_2024_df['user_account_created_time'].median()
pre_elections_2024_df['user_account_created_time'] = pre_elections_2024_df['user_account_created_time'].fillna(median_date)

# Drop these rows as the number of missing values is very insignificant in contrast to the dataset size
pre_elections_2024_df.dropna(subset=['user_awardee_karma', 'user_awarder_karma', 'user_link_karma', 'user_comment_karma', 'user_total_karma', 'self_text'], inplace=True)

pre_elections_2024_df.isna().sum()

In [None]:
# Debug statements

pre_elections_2024_df.info()
pre_elections_2024_df.describe()

In [None]:
# Debug statements

print(pre_elections_2024_df["score"].min())
print(pre_elections_2024_df["score"].max())

print(pre_elections_2024_df["ups"].min())
print(pre_elections_2024_df["ups"].max())

print(pre_elections_df.head(10))

## **Using VaderSentiment to explore the data**

In [None]:
print('Lowest Score: ', pre_elections_2024_df["score"].min())
print('Highest Score: ', pre_elections_2024_df["score"].max())

In [None]:
# Identify influential users based on karma
# sorting users by their total karma to identify influential users
influential_users = pre_elections_2024_df.groupby('author_name').agg({
    'user_total_karma': 'first',
    'user_comment_karma': 'first',
    'user_link_karma': 'first',
    'user_awardee_karma': 'first',
    'user_awarder_karma': 'first',
    'ups': 'sum',
    'downs': 'sum',
    'score': 'sum',
    'controversiality': 'mean'
}).reset_index()

# Sort influential users by total karma to focus on the most influential ones
influential_users = influential_users.sort_values(by='user_total_karma', ascending=False)

# Display the top 10 most influential users
print("Top 10 Most Influential Users in Republican Discussions:")
print(influential_users.head(10))

# Analyze Engagement Patterns for Influential Users
# Select the top 10 influential users
top_influential_users = influential_users.head(10)['author_name']

# Filter comments by these top users in the Republican-related data
top_user_comments = pre_elections_2024_df[pre_elections_2024_df['author_name'].isin(top_influential_users)]

# Calculate average engagement metrics for these top users
user_engagement_patterns = top_user_comments.groupby('author_name').agg({
    'ups': 'mean',
    'downs': 'mean',
    'score': 'mean',
    'controversiality': 'mean',
    'user_total_karma': 'first',
    'user_comment_karma': 'first'
}).reset_index()

# Visualize Engagement Patterns for Top Influential Users
# Display engagement patterns
print("\nEngagement Patterns for Top Influential Users:")
print(user_engagement_patterns)

# Bar chart for average upvotes top influential users
plt.figure(figsize=(12, 6))
plt.bar(user_engagement_patterns['author_name'], user_engagement_patterns['ups'], color='skyblue', label='Average Ups')
plt.xlabel("User")
plt.ylabel("Average Engagement")
plt.title("Average Upvotes for Top Influential Users in Republican Discussions")
plt.legend()
plt.xticks(rotation=45)
plt.show()

# Bar chart for total karma of top influential users
plt.figure(figsize=(12, 6))
plt.bar(user_engagement_patterns['author_name'], user_engagement_patterns['user_total_karma'], color='orange', label='Total karma')
plt.xlabel("User")
plt.ylabel("Total karma")
plt.title("Total karma of Top Influential Users in Republican Discussions")
plt.legend()
plt.xticks(rotation=45)
plt.show()

# Bar chart for total comment karma of top influential users
plt.figure(figsize=(12, 6))
plt.bar(user_engagement_patterns['author_name'], user_engagement_patterns['user_comment_karma'], color='green', label='Comment karma')
plt.xlabel("User")
plt.ylabel("Comment karma")
plt.title("Total comment karma of Top Influential Users in Republican Discussions")
plt.legend()
plt.xticks(rotation=45)
plt.show()


In [None]:
# Calculate account age in days
current_date = datetime.now()

pre_elections_2024_df['user_account_created_time'] = pd.to_datetime(pre_elections_2024_df['user_account_created_time'])
pre_elections_2024_df['account_age_days'] = pre_elections_2024_df['user_account_created_time'].apply(lambda x: (current_date - x).days)

# Perform Sentiment Analysis
# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to calculate sentiment score
def get_sentiment(text):
    """Return the VADER compound polarity of ``text``; null input scores 0."""
    if pd.isnull(text):
        return 0
    # 'compound' is the single overall sentiment value in VADER's score dict
    return analyzer.polarity_scores(text)['compound']

# Apply sentiment analysis to the comments about Republicans
# NOTE: comment out the next line to avoid rerunning the entire scoring process when the scores already exist
pre_elections_2024_df['sentiment_score'] = pre_elections_2024_df['self_text'].apply(get_sentiment)

# Explore the Correlation Between Account Age and Sentiment Score
# Drop any rows where 'sentiment_score' or 'account_age_days' is missing
pre_elections_2024_df.dropna(subset=['sentiment_score', 'account_age_days'], inplace=True)

# Calculate Pearson correlation coefficient
correlation, p_value = pearsonr(pre_elections_2024_df['account_age_days'], pre_elections_2024_df['sentiment_score'])

# Display results
print(f"Correlation between account age and sentiment score: {correlation:.4f}")
print(f"P-value of the correlation: {p_value:.4f}")

# Distribution of account age
plt.figure(figsize=(10, 6))
sns.histplot(pre_elections_2024_df['account_age_days'], bins=10, kde=True, color='skyblue', label='Account age')
plt.xlabel("User account age")
plt.ylabel("Count")
plt.title("User account age distribution")
plt.legend()
plt.show()

# Distribution of Sentiments
plt.figure(figsize=(10, 6))
sns.histplot(pre_elections_2024_df['sentiment_score'], bins=10, kde=True, color='brown', label='Sentiment score')
plt.xlabel("Sentiment scores")
plt.ylabel("Count")
plt.title("Sentiment score distribution")
plt.legend()
plt.show()

# Visualize the Correlation
plt.figure(figsize=(10, 6))
plt.scatter(pre_elections_2024_df['account_age_days'], pre_elections_2024_df['sentiment_score'], alpha=0.6, color='blue')
plt.xlabel("Account Age (Days)")
plt.ylabel("Sentiment Score")
plt.title("Correlation Between User Account Age and Sentiment Score Toward Republicans")
plt.grid(True)
plt.show()


In [None]:
del df, politics_df, monthly_rep_df, pre_elections_df

# **Data Cleaning**

## **Cleaning the textual data**

In [None]:
# Create Analysis DataFrame

analysis_df = pre_elections_2024_df

analysis_df

In [None]:
del pre_elections_2024_df

In [None]:
# Savepoint
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-cleaned.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-cleaned.csv")

In [None]:
# Load spaCy's small English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Strip markdown images, URLs, HTML entities and redundant whitespace.

    Args:
        text: Raw comment body. Non-string values are stringified; missing
            values (``None`` / NaN) are treated as an empty comment.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    # Missing values would otherwise be stringified to "None" / "nan" and then
    # scored and tokenized as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Step 1: Convert to string if not already
    if not isinstance(text, str):
        text = str(text)

    # Step 2: Remove markdown images and URLs
    text = re.sub(r'\!\[.*?\]\(.*?\)|https?://\S+|www\.\S+', '', text)

    # Step 3: Remove HTML encodings like '&gt;' and '&amp;'. This runs before
    # whitespace normalization so the gaps it leaves get collapsed too.
    text = re.sub(r"&\w+;", "", text)

    # Step 4: Collapse line breaks and runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

analysis_df['clean_text'] = analysis_df['self_text'].apply(preprocess_text)

In [None]:
analysis_df[['self_text', 'clean_text']]

In [None]:
# Savepoint
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-cleaned.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-cleaned.csv")

## **Using VADER for Sentiment Polarity**

In [None]:
# Function to calculate sentiment score
def get_sentiment_polarity(text):
    """Score ``text`` with VADER's compound polarity, using 0 for null input."""
    has_text = pd.notnull(text)
    return analyzer.polarity_scores(text)['compound'] if has_text else 0

# Apply sentiment analysis to the comments about Republicans
analysis_df['vader_sentiment_score'] = analysis_df['clean_text'].apply(get_sentiment_polarity)

In [None]:
analysis_df['vader_sentiment_score']

In [None]:
# Savepoint
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-vader.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-vader.csv")

## **Using TextBlob for Sentiment Polarity**

In [None]:
from textblob import TextBlob

# Comments that were empty after cleaning are read back from the CSV restore
# point as NaN; fill them first so astype(str) does not turn them into the
# literal word "nan".
analysis_df['clean_text'] = analysis_df['clean_text'].fillna('').astype(str)

# Calculate sentiment polarity
analysis_df['textblob_sentiment_score'] = analysis_df['clean_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
analysis_df[['clean_text', 'vader_sentiment_score','textblob_sentiment_score']]

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-textblob.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-textblob.csv")

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

# Filled on first use so the stop-word list is read and the lemmatizer is
# built once, instead of once per row when tokenize_text is used with .apply
_TOKENIZE_RESOURCES = {}


def _get_tokenize_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TOKENIZE_RESOURCES:
        _TOKENIZE_RESOURCES['stop_words'] = set(stopwords.words('english'))
        _TOKENIZE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TOKENIZE_RESOURCES['stop_words'], _TOKENIZE_RESOURCES['lemmatizer']


def tokenize_text(text):
    """Lower-case ``text``, drop non-letters and stop words, and lemmatize the rest.

    Args:
        text: Cleaned comment text.

    Returns:
        The remaining lemmatized words joined by single spaces, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()  # Convert to lowercase

    stop_words, lemmatizer = _get_tokenize_resources()

    # Split on whitespace, drop stop words, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    return " ".join(words)

analysis_df['processed_text'] = analysis_df['clean_text'].apply(tokenize_text)

In [None]:
# Replace NaN values with empty strings before applying Vectorizer
analysis_df['processed_text'] = analysis_df['processed_text'].fillna('')

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-tokenized.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-tokenized.csv")

# **LDA for Topic Discovery**

In [None]:
def print_topics(model, feature_names, n_top_words):
    """Print the ``n_top_words`` highest-weighted terms of every topic in ``model``."""
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to get the heaviest terms first
        heaviest_first = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in heaviest_first)
        print(f"Topic #{topic_number}:")
        print(top_terms)
        print("\n")

def calculate_topic_diversity(lda_model, feature_names, top_n=10):
    """Measure how little the topics' top words overlap.

    For every pair of topics, the share of the ``top_n`` words they have in
    common is computed; the result is one minus the mean of those shares.

    Args:
        lda_model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Vocabulary indexed by the column positions of ``components_``.
        top_n: Number of highest-weighted words taken from each topic.

    Returns:
        A float where 1.0 means no pair of topics shares a top word and 0.0
        means every pair shares all of them. Models with fewer than two topics
        have no pairs to compare and return 1.0.
    """
    top_word_sets = [
        {feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]}
        for topic in lda_model.components_
    ]

    n_topics = len(top_word_sets)
    if n_topics < 2:
        # No topic pairs exist; without this guard the division below raises
        # ZeroDivisionError.
        return 1.0

    total_pairs = n_topics * (n_topics - 1) / 2
    overlap = 0
    for i in range(n_topics):
        for j in range(i + 1, n_topics):
            overlap += len(top_word_sets[i] & top_word_sets[j]) / top_n
    return 1 - (overlap / total_pairs)

def create_lda_model(n_topics, vectorizer, processed_text):
    """Vectorize ``processed_text`` and fit an LDA model with ``n_topics`` topics.

    Prints the ten highest-weighted words of each topic and returns a tuple of
    (fitted LDA model, document-term matrix, per-document topic distributions,
    fitted vectorizer).
    """
    doc_term_matrix = vectorizer.fit_transform(processed_text)

    topic_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    topic_model.fit(doc_term_matrix)

    # Show the ten highest-weighted vocabulary terms of each fitted topic
    print_topics(topic_model, vectorizer.get_feature_names_out(), 10)

    # Shape (n_docs, n_topics)
    doc_topic_weights = topic_model.transform(doc_term_matrix)

    return topic_model, doc_term_matrix, doc_topic_weights, vectorizer

def print_metrics(lda_model, keywords):
    """Print the model's perplexity and log-likelihood score on ``keywords``."""
    print(f"Perplexity: {lda_model.perplexity(keywords)}")
    print(f"Log-Likelihood: {lda_model.score(keywords)}")

def get_silohouette_score(lda_model, topic_distributions, keywords, sample_size=None):
    """Print the silhouette score of documents grouped by their dominant topic.

    Args:
        lda_model: Fitted LDA model; only used when ``topic_distributions`` is None.
        topic_distributions: (n_docs, n_topics) array of topic weights. Pass
            ``None`` to have it computed from ``keywords``.
        keywords: Document-term matrix the model was fitted on.
        sample_size: Optional number of documents to score on instead of all of
            them; forwarded to ``silhouette_score``.
    """
    # Reuse the distributions the caller already has instead of repeating the
    # transform over the whole corpus.
    if topic_distributions is None:
        topic_distributions = lda_model.transform(keywords)  # (n_docs, n_topics)

    dominant_topics = np.argmax(topic_distributions, axis=1)
    silhouette = silhouette_score(
        topic_distributions,
        dominant_topics,
        sample_size=sample_size,
        random_state=42,  # only affects which documents are sampled
    )
    print(f"Silhouette Score: {silhouette:.2f}")

## **CountVectorizer**

### **5 topics**

In [None]:
# Vectorize text
vectorizer = CountVectorizer(max_features=1000, stop_words='english')

In [None]:
lda, keywords, topic_distributions, vectorizer = create_lda_model(5, vectorizer, analysis_df['processed_text'])

In [None]:
print_metrics(lda, keywords)

In [None]:
# Add dominant topic to the DataFrame
analysis_df['cv5_dominant_topic'] = topic_distributions.argmax(axis=1)
analysis_df[['self_text', 'cv5_dominant_topic']]

In [None]:
# get_silohouette_score(lda, topic_distributions, keywords)

In [None]:
# topic_diversity = calculate_topic_diversity(lda, feature_names, top_n=10)
# print(f"Topic Diversity: {topic_diversity:.2f}")

In [None]:
# Save LDA model with 5 topics
LDA_5 = {"model": lda, "vectorizer": vectorizer}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_LDA_5", 'wb') as model_file:
    pickle.dump(LDA_5, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-LDA-5.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-LDA-5.csv")

### **10 topics**

In [None]:
# Vectorize text
vectorizer = CountVectorizer(max_features=1000, stop_words='english')

In [None]:
lda_2, keywords_2, topic_distributions_2, vectorizer = create_lda_model(10, vectorizer, analysis_df['processed_text'])

In [None]:
print_metrics(lda_2, keywords_2)

In [None]:
# get_silohouette_score(lda_2, topic_distributions_2, keywords_2)

In [None]:
# Add dominant topic to the DataFrame
analysis_df['cv10_dominant_topic'] = topic_distributions_2.argmax(axis=1)
analysis_df[['self_text', 'cv10_dominant_topic']]

In [None]:
# Save LDA model with 10 topics
LDA_10 = {"model": lda_2, "vectorizer": vectorizer}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_LDA_10", 'wb') as model_file:
    pickle.dump(LDA_10, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-LDA-10.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-LDA-10.csv")

## **TfidfVectorizer**

### **5 topics**

In [None]:
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

In [None]:
tfidf_lda, tfidf_keywords, tfidf_topic_distributions, tfidf = create_lda_model(5, tfidf, analysis_df['processed_text'])

In [None]:
print_metrics(tfidf_lda, tfidf_keywords)

In [None]:
# get_silohouette_score(tfidf_lda, tfidf_topic_distributions, tfidf_keywords)

In [None]:
# Add dominant topic to the DataFrame
analysis_df['tf5_dominant_topic'] = tfidf_topic_distributions.argmax(axis=1)
analysis_df[['self_text', 'tf5_dominant_topic']]

In [None]:
# Save TFIDF model with 5 topics
TFIDF_5 = {"model": tfidf_lda, "vectorizer": tfidf}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_TFIDF_5", 'wb') as model_file:
    pickle.dump(TFIDF_5, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-TFIDF-5.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-TFIDF-5.csv")

### **10 Topics**

In [None]:
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

In [None]:
tfidf_lda_2, tfidf_keywords_2, tfidf_topic_distributions_2, tfidf = create_lda_model(10, tfidf, analysis_df['processed_text'])

In [None]:
print_metrics(tfidf_lda_2, tfidf_keywords_2)

In [None]:
# get_silohouette_score(tfidf_lda_2, tfidf_topic_distributions_2, tfidf_keywords_2)

In [None]:
# Add dominant topic to the DataFrame
analysis_df['tf10_dominant_topic'] = tfidf_topic_distributions_2.argmax(axis=1)
analysis_df[['self_text', 'tf10_dominant_topic']]

In [None]:
# Save TFIDF model with 10 topics
TFIDF_10 = {"model": tfidf_lda_2, "vectorizer": tfidf}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_TFIDF_10", 'wb') as model_file:
    pickle.dump(TFIDF_10, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-TFIDF-10.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-TFIDF-10.csv")

# K-Means Clustering


## TfidfVectorizer

### 5 Clusters

In [None]:
# Convert text to numerical representation using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(analysis_df['processed_text'])

In [None]:
# Apply K-Means clustering
num_clusters = 5  # Define the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Assign cluster labels to the DataFrame
analysis_df['cluster_tf5'] = kmeans.labels_

In [None]:
# Evaluate clustering

inertia = kmeans.inertia_
print(f"Inertia: {inertia}")

kmeans_score = kmeans.score(tfidf_matrix)
print(f"Score: {kmeans_score}")

In [None]:
# Save Kmeans model with 5 clusters
KMEANS_TFIDF_5 = {"model": kmeans, "vectorizer": tfidf_vectorizer}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_Kmeans_TFIDF_5", 'wb') as model_file:
    pickle.dump(KMEANS_TFIDF_5, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-kmeans-tfidf-10.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-kmeans-tfidf-10.csv")

### 10 Clusters

In [None]:
# Convert text to numerical representation using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(analysis_df['processed_text'])

In [None]:
num_clusters = 10  # Define the number of clusters
kmeans10 = KMeans(n_clusters=num_clusters, random_state=42)
kmeans10.fit(tfidf_matrix)

# Assign cluster labels to the DataFrame
analysis_df['cluster_tf10'] = kmeans10.labels_

In [None]:
# Evaluate clustering

inertia = kmeans10.inertia_
print(f"Inertia: {inertia}")

kmeans10_score = kmeans10.score(tfidf_matrix)
print(f"Score: {kmeans10_score}")

In [None]:
# Save Kmeans model with 10 clusters
KMEANS_TFIDF_10 = {"model": kmeans10, "vectorizer": tfidf_vectorizer}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_Kmeans_TFIDF_10", 'wb') as model_file:
    pickle.dump(KMEANS_TFIDF_10, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-kmeans-tfidf-5.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-kmeans-tfidf-5.csv")

## CountVectorizer

### 5 Clusters

In [None]:
# Convert text to numerical representation using CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(analysis_df['processed_text'])

In [None]:
# Apply K-Means clustering
num_clusters = 5  # Define the number of clusters
kmeans_cv5 = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_cv5.fit(count_matrix)

# Assign cluster labels to the DataFrame
analysis_df['cluster_cv5'] = kmeans_cv5.labels_

In [None]:
inertia = kmeans_cv5.inertia_
print(f"Inertia: {inertia}")

In [None]:
# Save Kmeans model with 5 clusters
KMEANS_CV_5 = {"model": kmeans_cv5, "vectorizer": count_vectorizer}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_Kmeans_cv_5", 'wb') as model_file:
    pickle.dump(KMEANS_CV_5, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-kmeans-cv-5.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-kmeans-cv-5.csv")

### 10 Clusters

In [None]:
# Convert text to numerical representation using CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(analysis_df['processed_text'])

In [None]:
num_clusters = 10  # Define the number of clusters
kmeans_cv10 = KMeans(n_clusters=num_clusters, random_state=42)
# Fit on the CountVectorizer matrix built in the previous cell. This section
# is the count-based run and the model is saved alongside count_vectorizer,
# so it must not be fitted on tfidf_matrix.
kmeans_cv10.fit(count_matrix)

# Assign cluster labels to the DataFrame
analysis_df['cluster_cv10'] = kmeans_cv10.labels_

In [None]:
inertia = kmeans_cv10.inertia_
print(f"Inertia: {inertia}")

In [None]:
# Save Kmeans model with 10 clusters
KMEANS_CV_10 = {"model": kmeans_cv10, "vectorizer": count_vectorizer}
# The context manager closes the file even if pickling fails
with open(MODELS + "/republicans_Kmeans_cv_10", 'wb') as model_file:
    pickle.dump(KMEANS_CV_10, model_file)

In [None]:
# Save point
analysis_df.to_csv(DATASETS_PATH + "/republicans-kmeans-cv-10.csv", index=False)

In [None]:
# Restore point
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-kmeans-cv-10.csv")

In [None]:
# # Load LDA CV model with 5 topics
# lda_cv_5 = pickle.load(open(MODELS + "/republicans_LDA_5", 'rb'))

# # Load LDA CV model with 10 topics
# lda_cv_10 = pickle.load(open(MODELS + "/republicans_LDA_10", 'rb'))

# # Load LDA TFIDF model with 5 topics
# lda_tfidf_5 = pickle.load(open(MODELS + "/republicans_TFIDF_5", 'rb'))


# # Load Kmeans CV model with 5 topics
# km_cv_5 = pickle.load(open(MODELS + "/republicans_Kmeans_cv_5", 'rb'))

# # Load Kmeans CV model with 10 topics
# km_cv_10 = pickle.load(open(MODELS + "/republicans_Kmeans_cv_10", 'rb'))

# # Load Kmeans TFIDF model with 5 topics
# km_tfidf_5 = pickle.load(open(MODELS + "/republicans_Kmeans_TFIDF_5", 'rb'))

# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.

# Load LDA TFIDF model with 10 topics
with open(MODELS + "/republicans_TFIDF_10", 'rb') as model_file:
    lda_tfidf_10 = pickle.load(model_file)

# Load Kmeans TFIDF model with 10 clusters
with open(MODELS + "/republicans_Kmeans_TFIDF_10", 'rb') as model_file:
    km_tfidf_10 = pickle.load(model_file)

# **Search Functionality**

In [None]:
# Function to extract top words from LDA topics
def extract_lda_topics(lda_model, vectorizer, n_top_words=10):
    """Map each LDA topic index to its highest-weighted vocabulary terms.

    Args:
        lda_model: fitted model exposing ``components_`` (one weight row per topic).
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        n_top_words: number of terms to keep per topic.

    Returns:
        dict of ``{topic_index: [term, ...]}`` with terms ordered from the
        largest weight downwards.
    """
    vocab = vectorizer.get_feature_names_out()
    return {
        topic_id: [vocab[pos] for pos in weights.argsort()[::-1][:n_top_words]]
        for topic_id, weights in enumerate(lda_model.components_)
    }

# Function to extract top terms for K-Means clusters
def extract_kmeans_clusters(kmeans_model, tfidf_vectorizer, n_top_words=10):
    """Map each K-Means cluster index to the terms with the largest centroid weight.

    Args:
        kmeans_model: fitted model exposing ``cluster_centers_``.
        tfidf_vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        n_top_words: number of terms to keep per cluster.

    Returns:
        dict of ``{cluster_index: [term, ...]}`` with terms ordered from the
        largest centroid component downwards.
    """
    centers = kmeans_model.cluster_centers_
    vocab = tfidf_vectorizer.get_feature_names_out()

    def strongest_terms(center):
        # Ascending argsort, reversed, then truncated -> top terms first
        ranked = center.argsort()[::-1][:n_top_words]
        return [vocab[pos] for pos in ranked]

    return {cluster_id: strongest_terms(center) for cluster_id, center in enumerate(centers)}

# Synonym expansion using WordNet
def get_synonyms(query):
    """Return the distinct lower-cased WordNet lemma names found for ``query``.

    Every synset returned by ``wn.synsets(query)`` contributes all of its
    lemma names; duplicates are collapsed. The order of the returned list is
    not defined because it is built from a set.
    """
    lemma_names = {
        lemma.name().lower()
        for synset in wn.synsets(query)
        for lemma in synset.lemmas()
    }
    return list(lemma_names)

# Fuzzy matching for terms
def fuzzy_match(query, keywords, threshold=80):
    """Return the keywords whose ``fuzz.ratio`` against ``query`` reaches ``threshold``.

    Both sides are lower-cased before comparison; the keywords are returned
    unchanged and in their original order.
    """
    return [
        candidate
        for candidate in keywords
        if fuzz.ratio(query.lower(), candidate.lower()) >= threshold
    ]

# Search function for both models
def search_models(query, lda_model, lda_vectorizer, kmeans_model, tfidf_vectorizer, n_top_words=10, threshold=80):
    """
    Look up ``query`` in the top terms of both the LDA and the K-Means model.

    The top ``n_top_words`` terms of every LDA topic and every K-Means cluster
    are extracted and searched with synonym expansion plus fuzzy matching
    (``threshold`` is forwarded to the matcher).

    Returns a dict with the keys "LDA" and "K-Means", each holding the
    ``{index: matched_terms}`` mapping produced for that model.
    """
    # LDA side: topic -> top terms, then search them
    topic_terms = extract_lda_topics(lda_model, lda_vectorizer, n_top_words)
    lda_hits = search_topics_with_synonyms_and_fuzzy(query, topic_terms, threshold)

    # K-Means side: cluster -> top terms, then search them
    cluster_terms = extract_kmeans_clusters(kmeans_model, tfidf_vectorizer, n_top_words)
    kmeans_hits = search_topics_with_synonyms_and_fuzzy(query, cluster_terms, threshold)

    return {"LDA": lda_hits, "K-Means": kmeans_hits}

# Unified search logic
def search_topics_with_synonyms_and_fuzzy(query, topic_keywords, threshold=80):
    """Find topics whose keywords match ``query`` or one of its WordNet synonyms.

    Args:
        query: the search word.
        topic_keywords: mapping of ``{topic_index: [keyword, ...]}``.
        threshold: minimum fuzzy ratio passed on to ``fuzzy_match``.

    Returns:
        dict of ``{topic_index: set_of_matched_keywords}`` containing only the
        topics with at least one exact or fuzzy hit.
    """
    search_terms = [query, *get_synonyms(query)]
    hits_by_topic = {}

    for term in search_terms:
        for topic_id, vocabulary in topic_keywords.items():
            # Exact hits first, then whatever the fuzzy matcher accepts
            found = [kw for kw in vocabulary if term == kw]
            found.extend(fuzzy_match(term, vocabulary, threshold))
            if not found:
                continue
            hits_by_topic.setdefault(topic_id, set()).update(found)

    return hits_by_topic

# Main invocation: ask for a search word, look it up in both models, print the hits
query = input("Enter a word to search: ")

# LDA and K-Means setup (Assume models are already trained)
# Each loaded bundle is a dict holding the fitted model and its vectorizer.
lda_model, lda_vectorizer = lda_tfidf_10["model"], lda_tfidf_10["vectorizer"]
kmeans_model, tfidf_vectorizer = km_tfidf_10["model"], km_tfidf_10["vectorizer"]

# Search across both models
search_results = search_models(query, lda_model, lda_vectorizer, kmeans_model, tfidf_vectorizer)

# Display results
# Test the dicts for emptiness directly: any() over a dict iterates its keys,
# so a result whose only match is topic/cluster index 0 would count as "no match".
if search_results["LDA"] or search_results["K-Means"]:
    print(f"Results for '{query}':\n")

    # LDA Results (indices are shown 1-based)
    if search_results["LDA"]:
        print("LDA Results:")
        for topic_idx, matched_keywords in search_results["LDA"].items():
            print(f"  Topic {topic_idx + 1}: {list(matched_keywords)}")
    else:
        print("No matches found in LDA.")

    # K-Means Results (indices are shown 1-based)
    if search_results["K-Means"]:
        print("\nK-Means Results:")
        for cluster_idx, matched_keywords in search_results["K-Means"].items():
            print(f"  Cluster {cluster_idx + 1}: {list(matched_keywords)}")
    else:
        print("No matches found in K-Means.")
else:
    print(f"No matches found for '{query}' in either model.")


# **Qualitative Analysis**

In [None]:
# Number of rows per value of 'tf5_dominant_topic'; the bare name on the last
# line displays the resulting Series as the cell output
topic_distribution = analysis_df['tf5_dominant_topic'].value_counts()
topic_distribution

In [None]:
# Engagement Analysis: mean 'score', 'ups' and 'downs' per dominant topic
engagement_stats = analysis_df.groupby('tf5_dominant_topic')[['score', 'ups', 'downs']].mean()

# Time-based Analysis: parse the creation timestamps, derive the hour of each
# post as a new 'hour' column, then count rows per hour
analysis_df['post_created_time'] = pd.to_datetime(analysis_df['post_created_time'])
analysis_df['hour'] = analysis_df['post_created_time'].dt.hour
hourly_activity = analysis_df.groupby('hour').size()


In [None]:
# Histogram of the 'vader_sentiment_score' column with a KDE curve overlaid
sns.histplot(analysis_df['vader_sentiment_score'], kde=True)
plt.title("Sentiment Distribution")
plt.show()


In [None]:
# Mean 'vader_sentiment_score' per dominant topic, sorted ascending and drawn
# as a horizontal bar chart
analysis_df.groupby('tf5_dominant_topic')['vader_sentiment_score'].mean().sort_values().plot(kind='barh')
plt.title("Average Sentiment Score by VADER")
plt.show()


In [None]:
# Mean 'textblob_sentiment_score' per dominant topic, sorted ascending and
# drawn as a horizontal bar chart
analysis_df.groupby('tf5_dominant_topic')['textblob_sentiment_score'].mean().sort_values().plot(kind='barh')
plt.title("Average Sentiment Score by Textblob")
plt.show()

In [None]:
# Map a numeric sentiment score to a categorical sentiment label
def get_sentiment(sentiment_score, threshold=0.0765):
  """Classify a sentiment score as "positive", "negative" or "neutral".

  Args:
    sentiment_score: numeric score to classify.
    threshold: symmetric decision boundary. Scores >= threshold are
      "positive", scores <= -threshold are "negative". Defaults to 0.0765,
      the value the notebook records as its best decision boundary.

  Returns:
    One of the strings "positive", "negative" or "neutral". Anything that
    satisfies neither comparison (including NaN) is "neutral".
  """
  if sentiment_score >= threshold:
    return "positive"
  elif sentiment_score <= -threshold:
    return "negative"
  else:
    return "neutral"

# Label every row by passing its VADER score through get_sentiment
analysis_df['vader_sentiment'] = analysis_df['vader_sentiment_score'].apply(get_sentiment)

In [None]:
# Bar chart with one bar per (dominant topic, sentiment label) pair, showing its row count.
# The commented-out lines are alternative views that are currently disabled.
# sns.histplot(analysis_df['vader_sentiment'], kde=True)
analysis_df.groupby('tf5_dominant_topic')['vader_sentiment'].value_counts().plot(kind='bar')
# analysis_df.groupby('vader_sentiment')['dominant_topic'].value_counts().plot(kind='barh')
# analysis_df[analysis_df['dominant_topic'] == 0]['vader_sentiment'].value_counts().plot(kind='bar')
plt.title("Sentiment Distribution")
plt.show()

In [None]:
# Save point: write analysis_df (now carrying the 'vader_sentiment' labels) to
# CSV, omitting the DataFrame index
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-vader-2.csv", index=False)

In [None]:
# Recover point: reload analysis_df from the CSV written by the save point above.
# Only the DataFrame is restored; other variables from earlier cells are not.
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-vader-2.csv")

In [None]:
# Create the multi-bar graph: count rows per (topic, sentiment) pair, then
# unstack so each sentiment label becomes its own column / bar series
# (the returned Axes is kept in `ax` but not used further in this cell)
ax = analysis_df.groupby('tf5_dominant_topic')['vader_sentiment'].value_counts().unstack().plot(kind='bar')
plt.title('Sentiment Distribution across Dominant Topics')
plt.xlabel('Dominant Topic')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')

plt.tight_layout()
plt.show()

In [None]:
# Rebuild the hourly counts from the 'hour' column so this cell also works when
# the notebook is resumed from a recover point: the CSV keeps the 'hour' column
# but the hourly_activity variable itself is not restored.
hourly_activity = analysis_df.groupby('hour').size()

# Line chart of the number of posts created in each hour
hourly_activity.plot(kind='line')
plt.title("Posting Activity by Hour")
plt.xlabel("Hour")
plt.ylabel("Number of Posts")
plt.show()

In [None]:
# Re-parse the timestamps, then express the distance to 2024-11-05 in whole
# 30-day units (rounded) as an integer column
analysis_df['post_created_time'] = pd.to_datetime(analysis_df['post_created_time'])
analysis_df['time_from_election_day'] = (
    (pd.to_datetime('2024-11-05 00:00:00') - analysis_df['post_created_time'])
    .dt.days
    .div(30)
    .round(0)
    .astype(int)
)

In [None]:
# Bar chart of the 50 largest (time bucket, sentiment) counts, one bar series per sentiment.
# The figure size is passed to the plot call itself: calling plt.figure() after
# plotting would open a second, empty figure and leave this chart at the default size.
analysis_df.groupby('time_from_election_day')['vader_sentiment'].value_counts().nlargest(50).unstack().plot(kind='bar', figsize=(15, 10))
plt.title("Sentiment Distribution by Time from Election Day")
plt.xlabel("Months Before from Election Day")
plt.ylabel("Count")
plt.show()

In [None]:
# One chart per topic id 0-4: the 50 largest (time bucket, sentiment) counts
# among the rows whose 'tf5_dominant_topic' equals i.
# NOTE(review): range(5) presumably matches the 5 topics of the "tf5" model — confirm.
for i in range(5):
  analysis_df[analysis_df['tf5_dominant_topic'] == i].groupby('time_from_election_day')['vader_sentiment'].value_counts().nlargest(50).unstack().plot(kind='bar')
  # Titles are 1-based while the topic ids are 0-based
  plt.title(f"Topic {i+1}: Sentiment Distribution by Time from Election Day")
  plt.xlabel("Months Before from Election Day")
  plt.ylabel("Count")
  plt.show()

In [None]:
# Mean 'vader_sentiment_score' per dominant topic, sorted ascending and drawn
# as a labelled horizontal bar chart
analysis_df.groupby('tf5_dominant_topic')['vader_sentiment_score'].mean().sort_values().plot(kind='barh')
plt.title("Topic-wise Average Sentiment Score")
plt.xlabel("Average Sentiment Score")
plt.ylabel("Topic")
plt.show()

In [None]:
# Multi-bar graph: count rows per (topic, sentiment) pair, then unstack so each
# sentiment label becomes its own bar series
# (the returned Axes is kept in `ax` but not used further in this cell)
ax = analysis_df.groupby('tf5_dominant_topic')['vader_sentiment'].value_counts().unstack().plot(kind='bar')
plt.title('Sentiment Distribution across Dominant Topics')
plt.xlabel('Dominant Topic')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')

plt.tight_layout()
plt.show()

In [None]:
# Save point: write analysis_df (including 'time_from_election_day') to CSV,
# omitting the DataFrame index
analysis_df.to_csv(DATASETS_PATH + "/republicans-politics-2024-qual.csv", index=False)

In [None]:
# Recover point: reload analysis_df from the CSV written by the save point above
analysis_df = pd.read_csv(DATASETS_PATH + "/republicans-politics-2024-qual.csv")

# Most Commonly Used Phrases Across Topics

In [None]:
# Define the analysis window: keep only rows whose 'created_month' equals
# '2024-08' (August 2024)
engaging_month = analysis_df[(analysis_df['created_month'] == '2024-08')]

# Function to get most common phrases
def get_most_common_phrases(comments, ngram_range, top_n):
    """Return the ``top_n`` most frequent n-grams found in ``comments``.

    Args:
        comments: iterable of texts; null entries are treated as empty strings
            and everything else is converted with ``str``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        top_n: maximum number of phrases to return.

    Returns:
        List of ``(phrase, count)`` tuples sorted by descending count. An
        empty list is returned when there is no non-blank text at all.
    """
    comments = ['' if pd.isnull(comment) else str(comment) for comment in comments]

    # CountVectorizer raises ValueError ("empty vocabulary") when given nothing
    # to tokenize, so bail out early when every comment is blank or missing.
    if not any(comment.strip() for comment in comments):
        return []

    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    bag_of_words = vectorizer.fit_transform(comments)
    # Column sums = total occurrences of each n-gram across all comments
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

# Most common phrases per topic: for every distinct 'tf5_dominant_topic' in the
# selected month, collect its 'clean_text' values and keep the 10 most frequent
# 3- to 5-word phrases
topic_phrases = {}
for topic in engaging_month["tf5_dominant_topic"].unique():
    topic_comments = engaging_month[engaging_month["tf5_dominant_topic"] == topic]["clean_text"].tolist()
    topic_phrases[topic] = get_most_common_phrases(topic_comments, ngram_range=(3, 5), top_n=10)

# Print most common phrases for each topic, one "phrase:count" line per phrase
for topic, phrases in topic_phrases.items():
    print(f"\nTopic {topic} Most Common Phrases:")
    for phrase, freq in phrases:
        print(f"{phrase}:{freq}")