# Visualization and Sentiment Analysis of Tweets Surrounding the 2020 US Election

This notebook explores and extracts information from tweets about the 2020 US election. The data consists of tweets collected with the Twitter API `statuses_lookup` endpoint and snscrape, using #DonaldTrump and #JoeBiden as keywords. The tweets were collected from 15.10.2020 to 08.11.2020.
We will look at the distribution of tweets by state and by source, the sentiment of the tweets for each candidate (negative, neutral, or positive), and the overall sentiment of the tweets.

# Loading the necessary libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

import re #regex
from textblob import TextBlob #sentimate analysis
from textblob import Word
from nltk.probability import FreqDist

#graphs
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Importing the datasets

In [None]:
# Load the two hashtag datasets, one per candidate.
# NOTE(review): lineterminator='\n' is presumably needed because the tweet
# text contains other line-break characters -- confirm against the raw CSVs.
trump_data = pd.read_csv(
    '/kaggle/input/us-election-2020-tweets/hashtag_donaldtrump.csv',
    lineterminator='\n',
)  # tweets collected for #DonaldTrump
biden_data = pd.read_csv(
    '/kaggle/input/us-election-2020-tweets/hashtag_joebiden.csv',
    lineterminator='\n',
)  # tweets collected for #JoeBiden

# Initial look: Trump Tweets

In [None]:
# Preview the first rows of the Trump dataset.
trump_data.head()

In [None]:
# (number of rows, number of columns) of the Trump dataset.
trump_data.shape 

In [None]:
# Summary statistics of the Trump dataset (describe() with its defaults).
trump_data.describe()

In [None]:
# Column names, dtypes and non-null counts of the Trump dataset.
trump_data.info()

# Initial Look: Biden Tweets

In [None]:
# Preview the first rows of the Biden dataset.
biden_data.head()

In [None]:
# (number of rows, number of columns) of the Biden dataset.
biden_data.shape

In [None]:
# Summary statistics of the Biden dataset (describe() with its defaults).
biden_data.describe()

In [None]:
# Column names, dtypes and non-null counts of the Biden dataset.
biden_data.info()

# Cleaning the data

In [None]:
# Drop the identifier, collection-timestamp and user-bio columns from both
# datasets; none of them is used in the analysis below.
trump_data, biden_data = (
    frame.drop(columns=['tweet_id', 'collected_at', 'user_description'])
    for frame in (trump_data, biden_data)
)

In [None]:
# Tag every row with the candidate whose hashtag dataset it came from.
trump_data['candidate'] = 'Trump'
biden_data['candidate'] = 'Biden'

In [None]:
# Sort by creation time. DataFrame.sort_values returns a new, sorted frame
# rather than sorting in place, so the result must be assigned back for the
# ordering to take effect. The index labels are kept as they are.
# NOTE(review): 'created_at' is compared as loaded from the CSV; if it is a
# plain string column this is a lexicographic sort -- confirm that the
# timestamp format sorts chronologically.
trump_data = trump_data.sort_values(by='created_at')
trump_data.head()

In [None]:
# Sort by creation time. DataFrame.sort_values returns a new, sorted frame
# rather than sorting in place, so the result must be assigned back for the
# ordering to take effect. The index labels are kept as they are.
# NOTE(review): 'created_at' is compared as loaded from the CSV; if it is a
# plain string column this is a lexicographic sort -- confirm that the
# timestamp format sorts chronologically.
biden_data = biden_data.sort_values(by='created_at')
biden_data.head()

In [None]:
# Make the country name consistent by mapping the long spelling onto the
# short one.
d = {"United States of America": "United States"}
# Assign the replaced column back instead of calling replace(inplace=True) on
# trump_data['country']: the in-place call acts on an intermediate object,
# which recent pandas versions warn about and which does not update the frame
# once Copy-on-Write is enabled.
trump_data['country'] = trump_data['country'].replace(d)
biden_data['country'] = biden_data['country'].replace(d)

# Keep only the tweets whose country is the United States.
trump_data = trump_data.loc[trump_data['country'] == "United States"]
biden_data = biden_data.loc[biden_data['country'] == "United States"]

# Drop every row that still has a null value in any column.
trump_data = trump_data.dropna()
biden_data = biden_data.dropna()

In [None]:
# Size of the Trump dataset after the US filter and the null-row removal.
trump_data.shape

# Visualization by States

In [None]:
# Number of (non-null) tweets per state, one single-column frame per candidate.
trump_data1 = trump_data.groupby(['state'])['tweet'].count()
test1 = trump_data1.to_frame()
biden_data1 = biden_data.groupby(['state'])['tweet'].count()
test2 = biden_data1.to_frame()

# Inner join on 'state': only states present in both datasets are kept. The
# merge suffixes the two clashing 'tweet' columns with _x / _y, which are then
# given readable names.
state_data = test1.merge(test2, on='state').rename(
    columns={'tweet_x': 'Trump Tweets', 'tweet_y': 'Biden Tweets'}
)
state_data.head()

In [None]:
# Grouped bar chart: one pair of bars (Trump / Biden) per state.
state_data.plot.bar(title='Number of Tweets per State', figsize=(20, 10), width=1)
plt.xlabel('State')
plt.ylabel('Number of Tweets')

# Visualization by Top 5 Sources

In [None]:
# Count tweets per source for each candidate, order the counts from largest
# to smallest and keep the top 5. head(5) keeps exactly five entries, matching
# the section heading; a [:6] slice would keep six.
trump_sources = (
    trump_data.groupby(['source'])['tweet']
    .count()
    .sort_values(ascending=False)
    .head(5)
)
source1 = trump_sources.to_frame()
biden_sources = (
    biden_data.groupby(['source'])['tweet']
    .count()
    .sort_values(ascending=False)
    .head(5)
)
source2 = biden_sources.to_frame()

# Inner join on 'source': only sources that appear in BOTH top-5 lists are
# kept, so the table can have fewer than five rows. The merge suffixes the
# two 'tweet' columns with _x / _y, which are then given readable names.
source_data = pd.merge(source1, source2, on='source')
source_data = source_data.rename(columns={'tweet_x': 'Trump Tweets', 'tweet_y': 'Biden Tweets'})
source_data.head()

In [None]:
# Grouped bar chart: one pair of bars (Trump / Biden) per source.
source_data.plot.bar(title='Number of Tweets per Source', figsize=(20, 10), width=0.75)
plt.xlabel('Source')
plt.ylabel('Number of Tweets')

# Cleaning the tweets

In [None]:
# Noise to blank out, tried in this order at each position: runs of digits,
# anything starting with http/https up to the next whitespace (URLs), and any
# run of characters that are not ASCII letters or digits ('#', '@',
# punctuation, emoji, ...). Compiled once because the cleaner is applied to
# every tweet.
_TWEET_NOISE = re.compile(r'\d+|https?\S+|[^A-Za-z0-9]+')

# Lazily filled cache so the stop-word list is built once, not once per tweet.
_STOP_WORDS = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS:
        _STOP_WORDS['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS['english']


def tweet_cleaning(tweet):
    """Lower-case a tweet, strip noise and return its non-stop-word tokens.

    Digits, URLs and every run of characters other than ASCII letters are
    replaced by a space. This removes the '#' and '@' symbols but keeps the
    hashtag or mention text itself. The remaining text is tokenised with
    NLTK's word_tokenize and English stop words are dropped.

    Args:
        tweet: The tweet text. Any non-string value (e.g. None or NaN from a
            missing cell) yields an empty list instead of raising.

    Returns:
        A list of lower-case tokens in their original order; duplicates are
        kept.
    """
    if not isinstance(tweet, str):
        return []

    text = _TWEET_NOISE.sub(' ', tweet.lower())
    stop_words = _english_stop_words()
    return [token for token in word_tokenize(text) if token not in stop_words]

# Sample call to demonstrate the function: the displayed result is the list
# of tokens left after cleaning and stop-word removal.
sample = "This is is is a sample sentence to test if stop words works"
tweet_cleaning(sample)

In [None]:
# Clean every tweet; each element of the resulting Series is a token list.
trump_tweets = trump_data['tweet'].apply(tweet_cleaning)
biden_tweets = biden_data['tweet'].apply(tweet_cleaning)

In [None]:
# Preview the cleaned token lists for the Trump tweets.
trump_tweets.head()

In [None]:
# Preview the cleaned token lists for the Biden tweets.
biden_tweets.head()

# Sentiment Analysis

In [None]:
#functions that will assist in the sentiment analysis
def subjectivity(tweet):
    """Return the subjectivity score TextBlob assigns to the text *tweet*."""
    blob = TextBlob(tweet)
    return blob.sentiment.subjectivity
def polarity(tweet):
    """Return the polarity score TextBlob assigns to the text *tweet*."""
    blob = TextBlob(tweet)
    return blob.sentiment.polarity
def conclusion(val):
    """Map a polarity score to a sentiment label.

    Returns 'negative' for values below zero, 'neutral' for exactly zero and
    'positive' for everything else.
    """
    if val < 0:
        return 'negative'
    return 'neutral' if val == 0 else 'positive'

**Trump Tweets**

In [None]:
# Score the raw tweet text, then label each polarity score.
subjectivity_col = trump_data['tweet'].map(subjectivity)
polarity_col = trump_data['tweet'].map(polarity)
analysis_col = polarity_col.map(conclusion)

# The 'Tweet' column shows the cleaned token lists, while the scores above
# were computed on the uncleaned text.
df = dict(
    Tweet=trump_tweets,
    Subjectivity=subjectivity_col,
    Polarity=polarity_col,
    Sentiment=analysis_col,
)
sentiment_analysis = pd.DataFrame(df)
sentiment_analysis

In [None]:
# Count the Trump tweets carrying each sentiment label.
neg_num = (sentiment_analysis['Sentiment'] == 'negative').sum()
neu_num = (sentiment_analysis['Sentiment'] == 'neutral').sum()
pos_num = (sentiment_analysis['Sentiment'] == 'positive').sum()

print('Sentiment Breakdown: Trump Tweets')
print('Negative Tweets: ', neg_num)
print('Neutral Tweets: ', neu_num)
print('Positive Tweets: ', pos_num)

In [None]:
# Bar chart of the Trump sentiment counts.
num_sentiment = pd.DataFrame(
    {
        'Tweet Sentiment': ['Negative', 'Neutral', 'Positive'],
        'Number of Tweets': [neg_num, neu_num, pos_num],
    }
)
trump_plot = num_sentiment.plot(
    kind='bar',
    x='Tweet Sentiment',
    y='Number of Tweets',
    title='Sentiment Analysis for Trump Tweets',
    color='red',
    rot=0,
)
plt.ylabel('Number of Tweets')
trump_plot

**Biden Tweets**

In [None]:
# Score the raw tweet text, then label each polarity score.
subjectivity_col = biden_data['tweet'].map(subjectivity)
polarity_col = biden_data['tweet'].map(polarity)
analysis_col = polarity_col.map(conclusion)

# The 'Tweet' column shows the cleaned token lists, while the scores above
# were computed on the uncleaned text.
df = dict(
    Tweet=biden_tweets,
    Subjectivity=subjectivity_col,
    Polarity=polarity_col,
    Sentiment=analysis_col,
)
sentiment_analysis2 = pd.DataFrame(df)
sentiment_analysis2

In [None]:
# Count the Biden tweets carrying each sentiment label.
neg_num2 = (sentiment_analysis2['Sentiment'] == 'negative').sum()
neu_num2 = (sentiment_analysis2['Sentiment'] == 'neutral').sum()
pos_num2 = (sentiment_analysis2['Sentiment'] == 'positive').sum()

print('Sentiment Breakdown: Biden Tweets')
print('Negative Tweets: ', neg_num2)
print('Neutral Tweets: ', neu_num2)
print('Positive Tweets: ', pos_num2)

In [None]:
# Bar chart of the Biden sentiment counts.
num_sentiment2 = pd.DataFrame(
    {
        'Tweet Sentiment': ['Negative', 'Neutral', 'Positive'],
        'Number of Tweets': [neg_num2, neu_num2, pos_num2],
    }
)
# Plot num_sentiment2 (the Biden counts). Plotting num_sentiment here would
# redraw the Trump counts under the Biden title.
biden_plot = num_sentiment2.plot.bar(
    x='Tweet Sentiment',
    y='Number of Tweets',
    title='Sentiment Analysis for Biden Tweets',
    color='blue',
    rot=0,
)
plt.ylabel('Number of Tweets')
biden_plot

# Overall Sentiment

In [None]:
# Side-by-side sentiment counts: one row per label, one column per candidate.
overall_sent = pd.DataFrame(
    {
        'Trump Tweets': [neg_num, neu_num, pos_num],
        'Biden Tweets': [neg_num2, neu_num2, pos_num2],
    },
    index=['Negative', 'Neutral', 'Positive'],
)
overall_sent

In [None]:
# Grouped bar chart comparing both candidates for each sentiment label.
overall_sent.plot.bar(title='Sentimental Analysis for All Tweets', figsize=(10, 7))
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')