In [None]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install seaborn
%pip install nltk
%pip install scikit-learn
%pip install wordcloud
%pip install geopy
%pip install folium
%pip install textblob

In [None]:
%pip install pandas numpy matplotlib seaborn nltk scikit-learn wordcloud geopy folium textblob spacy
%python -m spacy download en_core_web_sm

In [None]:
# import necessary libraries for data handling and analysis
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from textblob import TextBlob
import folium
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

# both hashtag exports are read with the same settings: python parser engine,
# utf-8 decoding, and malformed lines skipped instead of raising
csv_options = {'engine': 'python', 'encoding': 'utf-8', 'on_bad_lines': 'skip'}

# trump tweets -> dataframe
df_trump = pd.read_csv('data/hashtag_donaldtrump.csv', **csv_options)

# biden tweets -> dataframe
df_biden = pd.read_csv('data/hashtag_joebiden.csv', **csv_options)

# print the first rows of each dataset to see how it is structured
for heading, frame in (
    ("Trump tweets data preview:", df_trump),
    ("\nBiden tweets data preview:", df_biden),
):
    print(heading)
    print(frame.head())

In [None]:
# columns the analysis never reads; errors='ignore' tolerates any that are absent
columns_to_drop = ['user_id', 'tweet_id', 'lat', 'long', 'source']
df_trump = df_trump.drop(columns=columns_to_drop, errors='ignore')
df_biden = df_biden.drop(columns=columns_to_drop, errors='ignore')

# parse both timestamp columns in each dataset; unparseable values become NaT
for frame in (df_trump, df_biden):
    for timestamp_column in ('created_at', 'collected_at'):
        frame[timestamp_column] = pd.to_datetime(frame[timestamp_column], errors='coerce')

# keep only rows that have both tweet text and a state
required_columns = ['tweet', 'state']
df_trump = df_trump.dropna(subset=required_columns)
df_biden = df_biden.dropna(subset=required_columns)

# function to clean tweet text by removing URLs, special characters, and standardizing case
def clean_tweet(text):  # define function for tweet text cleaning
    """Normalise a raw tweet into lowercase text made of ASCII letters, digits and whitespace.

    Steps, in order: strip URLs, strip @mentions, drop the '#' symbol (the
    hashtag word itself is kept), turn newlines into spaces, remove every
    remaining character that is not A-Z, a-z, 0-9 or whitespace, lowercase.

    Args:
        text: the raw tweet string.

    Returns:
        The cleaned, lowercased string. Whitespace left behind by removed
        tokens is not collapsed.
    """
    # the dot after "www" is escaped so only a literal "www." starts a URL; unescaped,
    # it matched any character and deleted ordinary tokens that merely begin with "www"
    text = re.sub(r'http\S+|www\.\S+', '', text)  # remove urls
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#', '', text)  # remove hashtag symbol
    text = re.sub(r'\n', ' ', text)  # replace newline chars w spaces
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)  # remove special chars
    return text.lower()  # convert text to lowercase

# store a cleaned copy of every tweet in each dataset
# (this column is recomputed further down in this cell by the clean + lemmatize step)
for tweets_frame in (df_trump, df_biden):
    tweets_frame['cleaned_tweet'] = tweets_frame['tweet'].apply(clean_tweet)

# text lemmatization: the spaCy pipeline is loaded once and shared by every call
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma, joined by single spaces."""
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

# classify sentiment
def classify_sentiment(score, threshold=0.1):
    """Turn a polarity score into a sentiment label.

    Args:
        score: numeric polarity score.
        threshold: width of the neutral band around zero. Defaults to 0.1,
            the cut-off this notebook has always used, so existing calls
            behave exactly as before.

    Returns:
        "positive" when score > threshold, "negative" when
        score < -threshold, otherwise "neutral". Scores exactly on a
        boundary, and NaN, fall through to "neutral".
    """
    if score > threshold:
        return "positive"
    elif score < -threshold:
        return "negative"
    else:
        return "neutral"

# Filter tweets between August and November (both months fully included)
start_date = '2020-08-01'
end_date = '2020-11-30'
# A bare date string is compared as midnight at the start of that day, so
# `created_at <= end_date` would discard every tweet posted later on 30 November.
# Use an exclusive upper bound at the start of the following day instead.
end_exclusive = pd.Timestamp(end_date) + pd.Timedelta(days=1)
df_trump['created_at'] = pd.to_datetime(df_trump['created_at'], errors='coerce')
df_biden['created_at'] = pd.to_datetime(df_biden['created_at'], errors='coerce')
# .copy() gives each filtered frame its own data, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning)
df_trump = df_trump[(df_trump['created_at'] >= start_date) & (df_trump['created_at'] < end_exclusive)].copy()
df_biden = df_biden[(df_biden['created_at'] >= start_date) & (df_biden['created_at'] < end_exclusive)].copy()

# Clean and lemmatize tweets (replaces the cleaned_tweet column computed earlier in this cell)
df_trump['cleaned_tweet'] = df_trump['tweet'].apply(clean_tweet).apply(lemmatize_text)
df_biden['cleaned_tweet'] = df_biden['tweet'].apply(clean_tweet).apply(lemmatize_text)

# Sentiment analysis: TextBlob polarity per tweet, then a positive/neutral/negative label
df_trump['sentiment_score'] = df_trump['cleaned_tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
df_biden['sentiment_score'] = df_biden['cleaned_tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
df_trump['sentiment'] = df_trump['sentiment_score'].apply(classify_sentiment)
df_biden['sentiment'] = df_biden['sentiment_score'].apply(classify_sentiment)


In [None]:
# eda
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to the output.

# column dtypes and non-null counts for the trump dataset
print("\nTrump data info:")
df_trump.info()

# column dtypes and non-null counts for the biden dataset
print("\nBiden data info:")
df_biden.info()

# character length of each cleaned trump tweet, plus summary statistics
df_trump['tweet_length'] = df_trump['cleaned_tweet'].apply(len)  # add tweet length column for trump tweets
print("\nTrump tweet length statistics:")
print(df_trump['tweet_length'].describe())  # show summary statistics for tweet length

# character length of each cleaned biden tweet, plus summary statistics
df_biden['tweet_length'] = df_biden['cleaned_tweet'].apply(len)  # add tweet length column for biden tweets
print("\nBiden tweet length statistics:")
print(df_biden['tweet_length'].describe())  # show summary statistics for tweet length


Trump data info:
<class 'pandas.core.frame.DataFrame'>
Index: 971073 entries, 0 to 971086
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   created_at            970919 non-null  datetime64[ns]
 1   tweet                 971073 non-null  object        
 2   likes                 971045 non-null  object        
 3   retweet_count         970933 non-null  float64       
 4   user_name             970911 non-null  object        
 5   user_screen_name      970933 non-null  object        
 6   user_description      869661 non-null  object        
 7   user_join_date        970779 non-null  object        
 8   user_followers_count  970917 non-null  object        
 9   user_location         675830 non-null  object        
 10  city                  227180 non-null  object        
 11  country               442732 non-null  object        
 12  continent             442749 non-null  object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# stack the cleaned trump tweets on top of the cleaned biden tweets under a fresh 0..n-1 index
tweets = pd.concat(
    [df_trump['cleaned_tweet'], df_biden['cleaned_tweet']],
    ignore_index=True,
)

# vectorizer settings: document-frequency bounds, english stop-word list, vocabulary capped at 1000 terms
tfidf_options = {
    'max_df': 0.9,
    'min_df': 10,
    'stop_words': 'english',
    'max_features': 1000,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# learn the vocabulary and build the tf-idf matrix (one row per tweet) in a single pass
tfidf_matrix = tfidf_vectorizer.fit_transform(tweets)

# report the matrix dimensions as a sanity check
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (1748068, 1000)


In [None]:
from sklearn.cluster import KMeans

num_clusters = 5  # how many k-means clusters to look for

# build the model with a fixed seed and fit it on the tf-idf matrix in one step
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)

# one cluster id per row of the tf-idf matrix
tweets_clusters = kmeans.labels_

# the matrix rows are the trump tweets followed by the biden tweets,
# so the label array is split at the number of trump rows
trump_row_count = len(df_trump)
df_trump['cluster'] = tweets_clusters[:trump_row_count]
df_biden['cluster'] = tweets_clusters[trump_row_count:]

# show a few tweets from each dataset next to their assigned cluster
for heading, frame in (
    ("\nTrump tweets with cluster labels:", df_trump),
    ("\nBiden tweets with cluster labels:", df_biden),
):
    print(heading)
    print(frame[['cleaned_tweet', 'cluster']].head())



Trump tweets with cluster labels:
                                       cleaned_tweet  cluster
0  elecciones2020  en florida joebiden dice que d...        4
1  usa 2020 trump contro facebook e twitter copro...        0
2  trump as a student i used to hear for years fo...        0
3  2 hours since last tweet from trump maybe he i...        0
4  you get a tie and you get a tie trump s rally ...        0

Biden tweets with cluster labels:
                                       cleaned_tweet  cluster
0  elecciones2020  en florida joebiden dice que d...        4
1  hunterbiden hunterbidenemails joebiden joebide...        3
2     this is how biden made his  trumpisnotameri...        1
3   watching and setting dvr lets give him bonus ...        3
4  censorship hunterbiden biden bidenemails biden...        0


In [None]:
# vocabulary of the fitted tf-idf vectorizer, indexed by feature position
terms = tfidf_vectorizer.get_feature_names_out()

# per cluster centre: feature indices sorted from largest to smallest centroid weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("Top terms per cluster:")
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")

    # the ten highest-weighted terms of this cluster
    top_terms = [terms[position] for position in order_centroids[cluster_id][:10]]
    print("Top terms:", top_terms)

    # up to three cleaned tweets from each dataset that were assigned to this cluster
    trump_samples = df_trump.loc[df_trump['cluster'] == cluster_id, 'cleaned_tweet'].head(3).tolist()
    biden_samples = df_biden.loc[df_biden['cluster'] == cluster_id, 'cleaned_tweet'].head(3).tolist()
    print("Sample tweets:", trump_samples + biden_samples)

Top terms per cluster:

Cluster 0:
Top terms: ['trump', 'biden', 'election2020', 'donaldtrump', 'vote', 'amp', 'trump2020', 'elections2020', 'election', 'president']
Sample tweets: ['usa 2020 trump contro facebook e twitter coprono biden\xa0  donaldtrump  ', 'trump as a student i used to hear for years for ten years i heard china in 2019 and we have 15 and they dont know how many we have and i asked them how many do we have and they said sir we dont know but we have millions like 300 million  um what', '2 hours since last tweet from trump maybe he is very busy tremendously busy', 'censorship hunterbiden biden bidenemails bidenemail corruption ', 'in 2020 nypost is being censorship censored by twitter to manipulate a us election in favor of joebiden and against trump  but ccp from china or porn on twitter   thats always been fine for       is  sick', ' tell politicians to stick it with this free item    2020 biden deomocrat election politician politics president republican trump vpdebat

In [None]:
from textblob import TextBlob

# sentiment helper: polarity of a single tweet
def get_sentiment_score(text):
    """Return the TextBlob polarity of `text` (-1 to 1)."""
    return TextBlob(text).sentiment.polarity

# score every cleaned tweet in both datasets
for frame in (df_trump, df_biden):
    frame['sentiment_score'] = frame['cleaned_tweet'].apply(get_sentiment_score)

# mean polarity of the tweets in each cluster, per dataset
trump_cluster_sentiment = df_trump['sentiment_score'].groupby(df_trump['cluster']).mean()
biden_cluster_sentiment = df_biden['sentiment_score'].groupby(df_biden['cluster']).mean()

# report the per-cluster averages
print("Average Sentiment by Cluster for Trump Tweets:")
print(trump_cluster_sentiment)

print("\nAverage Sentiment by Cluster for Biden Tweets:")
print(biden_cluster_sentiment)

Average Sentiment by Cluster for Trump Tweets:
cluster
0    0.046928
1    0.006283
2    0.000162
3    0.067517
4    0.005565
Name: sentiment_score, dtype: float64

Average Sentiment by Cluster for Biden Tweets:
cluster
0    0.073713
1    0.007577
2    0.005295
3    0.082120
4    0.008179
Name: sentiment_score, dtype: float64


In [None]:
# vocabulary of the fitted tf-idf vectorizer, indexed by feature position
terms = tfidf_vectorizer.get_feature_names_out()

# per cluster centre: feature indices sorted from largest to smallest centroid weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("Top terms per cluster:")

# list the ten highest-weighted terms of every cluster
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id} Top Terms:")
    top_terms = [terms[position] for position in order_centroids[cluster_id][:10]]
    print("Top terms:", top_terms)

Top terms per cluster:

Cluster 0 Top Terms:
Top terms: ['trump', 'biden', 'election2020', 'donaldtrump', 'vote', 'amp', 'trump2020', 'elections2020', 'election', 'president']

Cluster 1 Top Terms:
Top terms: ['biden', 'trump', 'election2020', 'elections2020', 'vote', 'joe', 'donaldtrump', 'president', 'bidenharris2020', 'usa']

Cluster 2 Top Terms:
Top terms: ['trump', 'biden', 'donald', 'trump2020', 'president', 'vote', 'election2020', 'elections2020', 'joebiden', 'donaldtrump']

Cluster 3 Top Terms:
Top terms: ['joebiden', 'donaldtrump', 'kamalaharris', 'election2020', 'president', 'joe', 'vote', 'bidenharris2020', 'biden', 'america']

Cluster 4 Top Terms:
Top terms: ['la', 'que', 'en', 'el', 'trump', 'le', 'biden', 'los', 'se', 'les']


In [None]:
# Geospatial analysis
geolocator = Nominatim(user_agent="geoapi")
def get_coordinates(location):
    """Look up ``[latitude, longitude]`` for a place name via the Nominatim geocoder.

    Args:
        location: place name to geocode (this notebook passes values of the
            ``state`` column).

    Returns:
        A two-element list ``[latitude, longitude]``, or ``None`` when the
        input is missing, blank or not a string, when the geocoder finds no
        match, or when the lookup raises.
    """
    import warnings  # local import: the notebook's import cell does not bring it in

    # non-string values (None, NaN, numbers) have no .strip() and cannot be geocoded
    if not isinstance(location, str) or location.strip() == '':
        return None
    try:
        loc = geolocator.geocode(location)
    except Exception as exc:
        # stay best-effort (one failed lookup must not abort the whole run),
        # but report the failure instead of hiding it completely
        warnings.warn(f"Geocoding failed for {location!r}: {exc}")
        return None
    return [loc.latitude, loc.longitude] if loc else None

# Geocode each distinct state value once and reuse the answer for every tweet that
# carries it: applying get_coordinates row by row sends one network request per
# tweet, repeating the same lookups over and over.
unique_states = pd.concat([df_trump['state'], df_biden['state']]).dropna().unique()
state_coordinates = {state: get_coordinates(state) for state in unique_states}
df_trump['coordinates'] = df_trump['state'].apply(state_coordinates.get)
df_biden['coordinates'] = df_biden['state'].apply(state_coordinates.get)

# Heatmap helper
def create_heatmap(data, title):
    """Render the rows' coordinates as a folium heatmap and save it to ``<title>.html``.

    Args:
        data: frame with a 'coordinates' column; falsy entries (e.g. None) are skipped.
        title: file name stem of the saved HTML page.
    """
    # keep only rows that actually have coordinates
    valid_points = list(filter(None, data['coordinates']))

    # fixed starting view at lat 37.0902, lon -95.7129, zoom level 5
    base_map = folium.Map(location=[37.0902, -95.7129], zoom_start=5)
    HeatMap(valid_points).add_to(base_map)

    base_map.save(f"{title}.html")
    print(f"Heatmap saved as {title}.html")

# one heatmap file per dataset
for frame, file_stem in (
    (df_trump, "trump_sentiment_heatmap"),
    (df_biden, "biden_sentiment_heatmap"),
):
    create_heatmap(frame, file_stem)

# Plot the sentiment distribution
def plot_sentiment(data, title):
    """Show a bar chart of how many tweets fall into each sentiment class.

    Args:
        data: frame with a 'sentiment' column holding the labels
            "negative", "neutral" and "positive".
        title: chart title.
    """
    # Pin the bar order: left to its default, seaborn arranges string categories by
    # their order of appearance in the data, so the Trump and Biden charts could
    # list the classes in different orders and be hard to compare.
    sentiment_order = ["negative", "neutral", "positive"]
    sns.countplot(x='sentiment', data=data, order=sentiment_order, palette='viridis')
    plt.title(title)
    plt.xlabel("Sentiment")
    plt.ylabel("Count")
    plt.show()

# one sentiment chart per dataset
for frame, chart_title in (
    (df_trump, "Trump Tweet Sentiments"),
    (df_biden, "Biden Tweet Sentiments"),
):
    plot_sentiment(frame, chart_title)