In [1]:
# Required libraries
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust the width to display everything
pd.set_option('display.max_colwidth', None)  # Show full column content

from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import emoji
from datetime import datetime

In [2]:
# Load data
twitter_df = pd.read_csv(r"C:\Users\User\iCloudDrive\Cursos\Data Circle\DataCircle_Twitter_Project\twitter_cleaned_data.csv", lineterminator='\n')

In [46]:
# Clean column names
twitter_df.columns = twitter_df.columns.str.replace('\r', '')

twitter_df['country'] = twitter_df['country'].str.replace('\r', '')

# Convert created_at to datetime
twitter_df['created_at'] = pd.to_datetime(twitter_df['created_at'], errors='coerce')

# Create a column for 'created_at' with date only (without time info)
twitter_df["created_at_date"] = twitter_df["created_at"].dt.date

# Convert to datetime
twitter_df["created_at_date"] = pd.to_datetime(twitter_df["created_at_date"])

In [20]:
# Extract hashtags
twitter_df['hashtag'] = twitter_df['tweet'].str.findall(r'(#\w+)')

# Convert the list of hashtags to a string
twitter_df['hashtag'] = twitter_df['hashtag'].apply(lambda x: ', '.join(x))

In [4]:
# Sentiment polarity function
def get_polarity(text):
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    return polarity

# Apply sentiment polarity function to the DataFrame
twitter_df['polarity'] = twitter_df['tweet_cleaned'].apply(get_polarity)

In [6]:
# Create a column for Sentiment analysis
twitter_df['sentiment'] = twitter_df['polarity'].apply(lambda i: 'positive' if i > 0 else ('neutral' if i == 0 else 'negative'))

In [47]:
twitter_df.head(5)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_join_date,user_followers_count,user_location,city,state,candidate,tweet_cleaned,country,polarity,sentiment,hashtag,created_at_date
0,2020-10-15 00:00:02,1316529228091846912,"#Trump: As a student I used to hear for years, for ten years, I heard China! In 2019! And we have 1.5 and they don't know how many we have and I asked them how many do we have and they said 'sir we don't know.' But we have millions. Like 300 million.\n\nUm. What?",2,1,twitter web app,8436472,2007-08-26 05:56:11,1185,portland,portland,oregon,trump,#trump student used hear years ten years heard china 2019 15 dont know many asked many said sir dont know millions like 300 million um,united states,0.333333,positive,#Trump,2020-10-15
1,2020-10-15 00:00:02,1316529227471237120,2 hours since last tweet from #Trump! Maybe he is VERY busy. Tremendously busy.,0,0,trumpytweeter,828355589206056960,2017-02-05 21:32:17,32,unknown,unknown,unknown,trump,2 hours since last tweet #trump maybe busy tremendously busy,unknown,0.066667,positive,#Trump,2020-10-15
2,2020-10-15 00:00:08,1316529252301451264,You get a tie! And you get a tie! #Trump ‘s rally #Iowa https://t.co/jJalUUmh5D,4,3,twitter for iphone,47413798,2009-06-15 19:05:35,5393,washington dc,washington,district of columbia,trump,get tie get tie #trump rally #iowa,united states,0.0,neutral,"#Trump, #Iowa",2020-10-15
3,2020-10-15 00:00:17,1316529291052675072,@CLady62 Her 15 minutes were over long time ago. Omarosa never represented the black community! #TheReidOut \n\nShe cried to #Trump begging for a job!,2,0,twitter for android,1138416104,2013-02-01 01:37:38,2363,perriscalifornia,unknown,california,trump,clady62 15 minutes long time ago omarosa never represented black community #thereidout cried #trump begging job,united states,-0.108333,negative,"#TheReidOut, #Trump",2020-10-15
4,2020-10-15 00:00:17,1316529289949569024,@richardmarx Glad u got out of the house! DICK!!#trump 2020💪🏽🇺🇸🇺🇸,0,0,twitter for iphone,767401841030209536,2016-08-21 16:43:51,75,powell tn,unknown,unknown,trump,richardmarx glad u got house dick#trump 2020,unknown,0.5,positive,#trump,2020-10-15


In [48]:
# Create CSV for sentiment analysis
twitter_df.to_csv('twitter_sentiment.csv', index=False)

In [76]:
# WordCloud Function
def generate_wordcloud(df, sentiment):
        # Get the text to be used
        text = ' '.join(df[df['sentiment'] == sentiment]['tweet_cleaned'])

        # Add non important words as stopwords
        stopwords = STOPWORDS.union({
                'amp', 'biden', 'joebiden', 'joe', 'trump', 'realdonaldtrump', 
                'donaldtrump', 'trumps', 'vote', 'people', 'president', 'kamalaharri', 'u', 'kamalaharris', 'say', 'us', 'one', 'gop',
                'donald', 'know', 'thats', 'america', 'election2020', 'election', 'bidenharris', 'bidenharris2020', 'trump2020'
                })
        
        # Generate the word cloud
        wordcloud = WordCloud(stopwords=stopwords).generate(text)
        
        # Extract word frequencies
        word_freq = wordcloud.words_
        
        # Convert to DataFrame
        df = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])
        
        return df

In [77]:
# Hashtag WordCloud Function
def generate_hashtag_wordcloud(df, sentiment):
        # Get the text to be used and convert it to lowercase
        text = ' '.join(df[df['sentiment'] == sentiment]['hashtag']).lower()

        # Add 'amp', 'biden', and 'trump' as stopwords
        stopwords = {word.lower() for word in STOPWORDS.union({
                'amp', 'biden', 'joebiden', 'joe', 'trump', 'realdonaldtrump', 
                'donaldtrump', 'trumps', 'vote', 'people', 'president', 'kamalaharri', 'u', 'kamalaharris', 'say', 'us', 'one', 'gop',
                'donald', 'know', 'thats', 'america', 'election2020', 'election', 'bidenharris', 'bidenharris2020', 'bidenharis2020', 
                'trump2020', 'biden2020', 'elections2020', 'obama'
                })}
        
        # Generate the word cloud
        wordcloud = WordCloud(stopwords=stopwords).generate(text)
        
        # Extract word frequencies
        word_freq = wordcloud.words_
        
        # Convert to DataFrame
        df = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])
        
        return df

In [None]:
# Create a df for each candidate
biden_df = twitter_df.loc[twitter_df['candidate']=='biden']
trump_df = twitter_df.loc[twitter_df['candidate']=='trump']

In [78]:
# Apply wordcloud function for Whole Period
biden_positive_wordcloud_df = generate_wordcloud(biden_df, 'positive')
biden_negative_wordcloud_df = generate_wordcloud(biden_df, 'negative')
trump_positive_wordcloud_df = generate_wordcloud(trump_df, 'positive')
trump_negative_wordcloud_df = generate_wordcloud(trump_df, 'negative')

In [79]:
# Create CSV for WordCloud Whole Period
biden_positive_wordcloud_df.to_csv('biden_positive_wordcloud.csv', index=False)
biden_negative_wordcloud_df.to_csv('biden_negative_wordcloud.csv', index=False)
trump_positive_wordcloud_df.to_csv('trump_positive_wordcloud.csv', index=False)
trump_negative_wordcloud_df.to_csv('trump_negative_wordcloud.csv', index=False)

In [80]:
# Apply Hashtag wordcloud function for Whole Period
biden_hashtag_positive_wordcloud_df = generate_wordcloud(biden_df, 'positive')
biden_hashtag_negative_wordcloud_df = generate_wordcloud(biden_df, 'negative')
trump_hashtag_positive_wordcloud_df = generate_wordcloud(trump_df, 'positive')
trump_hashtag_negative_wordcloud_df = generate_wordcloud(trump_df, 'negative')

In [81]:
# Create CSV for Hashtag WordCloud for Whole Period
biden_hashtag_positive_wordcloud_df.to_csv('biden_hashtag_positive_wordcloud.csv', index=False)
biden_hashtag_negative_wordcloud_df.to_csv('biden_hashtag_negative_wordcloud.csv', index=False)
trump_hashtag_positive_wordcloud_df.to_csv('trump_hashtag_positive_wordcloud.csv', index=False)
trump_hashtag_negative_wordcloud_df.to_csv('trump_hashtag_negative_wordcloud.csv', index=False)

In [82]:
# Filter the dataset
biden_16_10_df = biden_df.loc[twitter_df["created_at_date"]=="2020-10-16"]
trump_16_10_df = trump_df.loc[twitter_df["created_at_date"]=="2020-10-16"]

In [83]:
# Apply wordcloud function for 16/10/20
biden_16_10_positive_wordcloud_df = generate_wordcloud(biden_16_10_df, 'positive')
biden_16_10_negative_wordcloud_df = generate_wordcloud(biden_16_10_df, 'negative')
trump_16_10_positive_wordcloud_df = generate_wordcloud(trump_16_10_df, 'positive')
trump_16_10_negative_wordcloud_df = generate_wordcloud(trump_16_10_df, 'negative')

In [84]:
# Create CSV for WordCloud for 16/10/20
biden_16_10_positive_wordcloud_df.to_csv('biden_16_10_positive_wordcloud.csv', index=False)
biden_16_10_negative_wordcloud_df.to_csv('biden_16_10_negative_wordcloud.csv', index=False)
trump_16_10_positive_wordcloud_df.to_csv('trump_16_10_positive_wordcloud.csv', index=False)
trump_16_10_negative_wordcloud_df.to_csv('trump_16_10_negative_wordcloud.csv', index=False)

In [85]:
# Apply Hashtag wordcloud function for 16/10/20
biden_16_10_hashtag_positive_wordcloud_df = generate_hashtag_wordcloud(biden_16_10_df, 'positive')
biden_16_10_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(biden_16_10_df, 'negative')
trump_16_10_hashtag_positive_wordcloud_df = generate_hashtag_wordcloud(trump_16_10_df, 'positive')
trump_16_10_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(trump_16_10_df, 'negative')

In [86]:
# Create CSV for Hashtag WordCloud for 16/10/20
biden_16_10_hashtag_positive_wordcloud_df.to_csv('biden_16_10_hashtag_positive_wordcloud.csv', index=False)
biden_16_10_hashtag_negative_wordcloud_df.to_csv('biden_16_10_hashtag_negative_wordcloud.csv', index=False)
trump_16_10_hashtag_positive_wordcloud_df.to_csv('trump_16_10_hashtag_positive_wordcloud.csv', index=False)
trump_16_10_hashtag_negative_wordcloud_df.to_csv('trump_16_10_hashtag_negative_wordcloud.csv', index=False)

In [87]:
# Filter the dataset
biden_23_10_df = biden_df.loc[biden_df["created_at_date"]=="2020-10-23"]
trump_23_10_df = trump_df.loc[trump_df["created_at_date"]=="2020-10-23"]

In [88]:
# Apply wordcloud function for 23/10/20
biden_23_10_positive_wordcloud_df = generate_wordcloud(biden_23_10_df, 'positive')
biden_23_10_negative_wordcloud_df = generate_wordcloud(biden_23_10_df, 'negative')
trump_23_10_positive_wordcloud_df = generate_wordcloud(trump_23_10_df, 'positive')
trump_23_10_negative_wordcloud_df = generate_wordcloud(trump_23_10_df, 'negative')

In [89]:
# Create CSV for WordCloud for 23/10/20
biden_23_10_positive_wordcloud_df.to_csv('biden_23_10_positive_wordcloud.csv', index=False)
biden_23_10_negative_wordcloud_df.to_csv('biden_23_10_negative_wordcloud.csv', index=False)
trump_23_10_positive_wordcloud_df.to_csv('trump_23_10_positive_wordcloud.csv', index=False)
trump_23_10_negative_wordcloud_df.to_csv('trump_23_10_negative_wordcloud.csv', index=False)

In [90]:
# Apply Hashtag wordcloud function for for 23/10/20
biden_23_10_hashtag_positive_wordcloud_df = generate_hashtag_wordcloud(biden_23_10_df, 'positive')
biden_23_10_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(biden_23_10_df, 'negative')
trump_23_10_hashtag_positive_wordcloud_df = generate_hashtag_wordcloud(trump_23_10_df, 'positive')
trump_23_10_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(trump_23_10_df, 'negative')

In [91]:
# Create CSV for Hashtag WordCloud for 23/10/20
biden_23_10_hashtag_positive_wordcloud_df.to_csv('biden_23_10_hashtag_positive_wordcloud.csv', index=False)
biden_23_10_hashtag_negative_wordcloud_df.to_csv('biden_23_10_hashtag_negative_wordcloud.csv', index=False)
trump_23_10_hashtag_positive_wordcloud_df.to_csv('trump_23_10_hashtag_positive_wordcloud.csv', index=False)
trump_23_10_hashtag_negative_wordcloud_df.to_csv('trump_23_10_hashtag_negative_wordcloud.csv', index=False)

In [92]:
# Filter the dataset
biden_03_11_df = biden_df.loc[biden_df["created_at_date"]=="2020-11-03"]
trump_03_11_df = trump_df.loc[trump_df["created_at_date"]=="2020-11-03"]

In [93]:
# Apply wordcloud function for 03/11/20
biden_03_11_positive_wordcloud_df = generate_wordcloud(biden_03_11_df, 'positive')
biden_03_11_negative_wordcloud_df = generate_wordcloud(biden_03_11_df, 'negative')
trump_03_11_positive_wordcloud_df = generate_wordcloud(trump_03_11_df, 'positive')
trump_03_11_negative_wordcloud_df = generate_wordcloud(trump_03_11_df, 'negative')

In [94]:
# Create CSV for WordCloud for 03/11/20
biden_03_11_positive_wordcloud_df.to_csv('biden_03_11_positive_wordcloud.csv', index=False)
biden_03_11_negative_wordcloud_df.to_csv('biden_03_11_negative_wordcloud.csv', index=False)
trump_03_11_positive_wordcloud_df.to_csv('trump_03_11_positive_wordcloud.csv', index=False)
trump_03_11_negative_wordcloud_df.to_csv('trump_03_11_negative_wordcloud.csv', index=False)

In [95]:
# Apply Hashtag wordcloud function for 03/11/20
biden_03_11_hashtag_positive_wordcloud_df = generate_hashtag_wordcloud(biden_03_11_df, 'positive')
biden_03_11_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(biden_03_11_df, 'negative')
trump_03_11_hashtag_positive_wordcloud_df = generate_hashtag_wordcloud(trump_03_11_df, 'positive')
trump_03_11_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(trump_03_11_df, 'negative')

In [96]:
# Create CSV for Hashtag WordCloud for 03/11/20
biden_03_11_hashtag_positive_wordcloud_df.to_csv('biden_03_11_hashtag_positive_wordcloud.csv', index=False)
biden_03_11_hashtag_negative_wordcloud_df.to_csv('biden_03_11_hashtag_negative_wordcloud.csv', index=False)
trump_03_11_hashtag_positive_wordcloud_df.to_csv('trump_03_11_hashtag_positive_wordcloud.csv', index=False)
trump_03_11_hashtag_negative_wordcloud_df.to_csv('trump_03_11_hashtag_negative_wordcloud.csv', index=False)

In [97]:
# Biden's Polarity Sentiment means over time
biden_sentiment_means = biden_df.groupby('created_at_date')['polarity'].mean()

# Trump's Polarity Sentiment means over time
trump_sentiment_means = trump_df.groupby('created_at_date')['polarity'].mean()

# Convert trumps daily means to a dataframe
trump_daily_means_df = pd.DataFrame(trump_sentiment_means).reset_index()

# Convert Bidens daily means to a dataframe
biden_daily_means_df = pd.DataFrame(biden_sentiment_means).reset_index()

In [None]:
# Get the date of the min polarity mean for each candidate
biden_min_polarity_mean_date = biden_daily_means_df.loc[biden_daily_means_df["polarity"].idxmin(), "created_at_date"]
trump_min_polarity_mean_date = trump_daily_means_df.loc[trump_daily_means_df["polarity"].idxmin(), "created_at_date"]

In [105]:
# Filter the dataset to get the rows of the min polarity mean for each candidate
biden_min_pol_date_df = biden_df.loc[(biden_df["created_at_date"]==biden_min_polarity_mean_date)]
trump_min_pol_date_df = trump_df.loc[(trump_df["created_at_date"]==trump_min_polarity_mean_date)]

In [106]:
# Apply wordcloud function for the date of the min polarity mean for each candidate
biden_min_pol_date_negative_wordcloud_df = generate_wordcloud(biden_min_pol_date_df, 'negative')
trump_min_pol_date_negative_wordcloud_df = generate_wordcloud(trump_min_pol_date_df, 'negative')

In [107]:
# Create CSV for WordCloud for the date of the min polarity mean for each candidate
biden_min_pol_date_negative_wordcloud_df.to_csv('biden_min_pol_date_negative_wordcloud.csv', index=False)
trump_min_pol_date_negative_wordcloud_df.to_csv('trump_min_pol_date_negative_wordcloud.csv', index=False)

In [108]:
# Apply Hashtag wordcloud function for the date of the min polarity mean for each candidate
biden_min_pol_date_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(biden_min_pol_date_df, 'negative')
trump_min_pol_date_hashtag_negative_wordcloud_df = generate_hashtag_wordcloud(trump_min_pol_date_df, 'negative')

In [109]:
# Create CSV for Hashtag WordCloud for the date of the min polarity mean for each candidate
biden_min_pol_date_hashtag_negative_wordcloud_df.to_csv('biden_min_pol_date_negative_hashtag_wordcloud.csv', index=False)
trump_min_pol_date_hashtag_negative_wordcloud_df.to_csv('trump_min_pol_date_negative_hashtag_wordcloud.csv', index=False)

# Emoji 

In [38]:
# Biden Emoji creation
biden_emojis_list = []
biden_recognized_emojis = set(emoji.EMOJI_DATA.keys())

for tweet in biden_df['tweet']:
    for char in tweet:
        if char in biden_recognized_emojis:
            biden_emojis_list.append(char)


# Trump Emoji creation
trump_emojis_list = []
trump_recognized_emojis = set(emoji.EMOJI_DATA.keys())

for tweet in trump_df['tweet']:
    for char in tweet:
        if char in trump_recognized_emojis:
            trump_emojis_list.append(char)

In [43]:
# Frequency count of emojis

# Biden
biden_emoji_counts = Counter(biden_emojis_list)
biden_top_emojis = biden_emoji_counts.most_common(10)

# Trump
trump_emoji_counts = Counter(trump_emojis_list)
trump_top_emojis = trump_emoji_counts.most_common(10)

In [46]:
# Create DataFrame for the top emojis
biden_emoji_df = pd.DataFrame(biden_top_emojis, columns=['Emoji', 'Frequency'])
trump_emoji_df = pd.DataFrame(trump_top_emojis, columns=['Emoji', 'Frequency'])

In [49]:
# Create CSV for emoji analysis
biden_emoji_df.to_csv('biden_emojis.csv')
trump_emoji_df.to_csv('trump_emojis.csv')

In [51]:
biden_emoji_df

Unnamed: 0,Emoji,Frequency
0,💙,17506
1,😂,12943
2,🤣,8944
3,🌊,6947
4,👏,6741
5,❤,6107
6,🏻,4997
7,🙏,4813
8,👇,3912
9,🏼,3907
