# Political Social Media Analysis

In this project, I will compare the tweets of Donald Trump, Barack Obama, and Hillary Clinton to come up with meaningful insights.

In this notebook, I will import the cleaned data and come up with as many insights as possible

There are 3 CSV files which will be used:
1. DonaldTrumpClean
2. BarackObamaClean
3. HillaryClintonClean

All 3 have the same structure:
`date, retweet, text, author`

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Read the clean data

In [None]:
# Load the three cleaned tweet exports, one DataFrame per politician.
trump, obama, clinton = map(
    pd.read_csv,
    (
        "data/DonaldTrumpClean.csv",
        "data/BarackObamaClean.csv",
        "data/HillaryClintonClean.csv",
    ),
)

In [None]:
# Row counts in the order Trump, Obama, Clinton.
print(*map(len, (trump, obama, clinton)))

In [None]:
# Remove the leftover 'Unnamed: 0' column (presumably the index written when
# the cleaned CSVs were saved - TODO confirm).
# errors='ignore' keeps this cell safe to re-run: an in-place drop raises
# KeyError the second time because the column is already gone.
trump = trump.drop(columns='Unnamed: 0', errors='ignore')
obama = obama.drop(columns='Unnamed: 0', errors='ignore')
clinton = clinton.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Convert the 'date' column of every frame to pandas datetimes so the
# .dt accessors used further down are available.
trump, obama, clinton = (
    frame.assign(date=pd.to_datetime(frame['date']))
    for frame in (trump, obama, clinton)
)

### Insight 1: Analysis of how active each of them is

Time frame for the dataset is 01-01-2014 to 13-10-2016

In [None]:
# Bar chart of the total number of tweets per politician.
# xpos / ypos are reused by the labelled version of this chart below.
xpos = ["Trump", "Obama", "Clinton"]
ypos = [len(frame) for frame in (trump, obama, clinton)]
title = "Number of tweets from Jan-14 to Oct-16"

plt.bar(xpos, ypos)
plt.suptitle(title, fontsize=16)
plt.xlabel("Politician", fontsize=12)
plt.ylabel("Tweets", fontsize=12)

In [None]:
width = 0.5       # the width of the bars
title = "Number of tweets from Jan-14 to Oct-16"

# Size the figure when it is created. Calling plt.figure(figsize=...) after
# plt.subplots() opens a second, empty figure: the bars keep the default size
# and the title ends up on the empty figure.
fig, ax = plt.subplots(figsize=(3, 4))

rects1 = ax.bar(xpos, ypos, width, color='r')
# Leave headroom above the tallest bar for its text label.
ax.margins(y=0.15)
ax.set_ylabel("Tweets")


def autolabel(rects, axes=None):
    """
    Attach a text label above each bar displaying its height.

    Parameters
    ----------
    rects : iterable of bar patches
        The bars returned by ``Axes.bar``.
    axes : matplotlib Axes, optional
        Axes to draw the labels on. Defaults to the module-level ``ax``,
        which is what the original one-argument call relied on.
    """
    target = ax if axes is None else axes
    for rect in rects:
        height = rect.get_height()
        # Centre the label horizontally on the bar, 5% above its top.
        target.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                    '%d' % int(height),
                    ha='center', va='bottom')


autolabel(rects1)

fig.suptitle(title)
plt.show()

### Time series of number of tweets every quarter

In [None]:
# Work on copies so the loaded frames stay untouched. 'quarters' numbers the
# calendar quarters consecutively starting from 2014-Q1 = 1
# (2015-Q1 = 5, 2016-Q1 = 9, ...).
trump_copy, obama_copy, clinton_copy = (
    frame.assign(
        quarters=frame.date.dt.quarter + (frame.date.dt.year - 2014) * 4
    )
    for frame in (trump, obama, clinton)
)

In [None]:
# Make sure the quarter index is stored as plain integers.
trump_copy, obama_copy, clinton_copy = (
    frame.astype({'quarters': int})
    for frame in (trump_copy, obama_copy, clinton_copy)
)

In [None]:
# Number of tweets (non-null 'text' values) per quarter index, one Series
# per politician.
trumpQuartCount, obamaQuartCount, clintonQuartCount = (
    frame.groupby(['quarters'])['text'].count()
    for frame in (trump_copy, obama_copy, clinton_copy)
)

In [None]:
# Quarter index 12 is 2016-Q4 under the 2014-based numbering; the quarter
# labels used for plotting stop at 2016-Q3, so it is excluded here.
# errors='ignore' keeps the cell re-runnable and tolerates a politician with
# no quarter-12 tweets (an in-place drop raises KeyError in both cases).
trumpQuartCount = trumpQuartCount.drop(12, errors='ignore')
clintonQuartCount = clintonQuartCount.drop(12, errors='ignore')
obamaQuartCount = obamaQuartCount.drop(12, errors='ignore')

In [None]:
# Clinton has no data for 2014, so those quarters are added with 0 tweets.
# Reindexing onto quarters 1..11 (2014-Q1 .. 2016-Q3) fills every missing
# quarter with 0 without overwriting a count that is actually present, and
# returns the Series already sorted by quarter.
clintonQuartCount = clintonQuartCount.reindex(range(1, 12), fill_value=0)

In [None]:
# Axis labels for quarter indices 1..11: every quarter from 2014-Q1 through
# 2016-Q3 (2016-Q4 is left out).
quarters = [
    f"{year}-Q{quarter}"
    for year in (2014, 2015, 2016)
    for quarter in range(1, 5)
    if (year, quarter) != (2016, 4)
]

In [None]:
# One line per politician over the shared quarter labels.
plt.figure(figsize=(12, 6))  # width: 12, height: 6 (inches)
plt.title("Politicians' tweets per quarter from 2014 to 2016", fontsize=20)

plt.plot(quarters, obamaQuartCount, label="Obama's Tweets")
plt.plot(quarters, trumpQuartCount, label="Trump's Tweets")
plt.plot(quarters, clintonQuartCount, label="Hillary's Tweets")

plt.legend(loc='upper left')
plt.show()

## Creating wordclouds

Now we'll be creating wordclouds to have a rough gauge of the most frequently used words by the different politicians before we go into analysing the sentiments

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image, ImageFilter

In [None]:
def transform_format(val):
    """Map a mask pixel value of 0 to 255; return any other value unchanged."""
    return 255 if val == 0 else val

In [None]:
# Load the silhouette image; the context manager closes the file handle once
# the pixels have been copied into an array.
with Image.open("images/trump.png") as mask_image:
    trump_mask = np.array(mask_image)

# Transform the mask into one that works with WordCloud: every 0 pixel
# becomes 255, everything else is kept, stored as int32. np.where does this
# in one vectorised step instead of filling an uninitialised array row by
# row in Python.
transformed_trump_mask = np.where(trump_mask == 0, 255, trump_mask).astype(np.int32)

# Words to leave out of the cloud: the library defaults plus Trump's own name.
stopwords = set(STOPWORDS)
stopwords.update(["realDonaldTrump", "Donald", "Trump"])

# One long string containing every tweet. Missing texts are skipped so a NaN
# in the column cannot make str.join raise a TypeError.
trumpTweets = " ".join(trump['text'].dropna().astype(str))

In [None]:
# Create a word cloud image
wc = WordCloud(background_color="white", max_words=1000, mask=transformed_trump_mask,
               stopwords=stopwords)

# Generate a wordcloud
wc.generate(trumpTweets)

# store to file
wc.to_file("images/trumpWordcloud.png")

# show
plt.figure(figsize=[20,10], edgecolor='black')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In this Trump-shaped Wordcloud, we can see that the most common words Trump uses are `Thank`, `will`, `great` etc.

Unsurprisingly, he uses `Obama` a lot as well

In [None]:
# Library default stopwords plus terms that refer to Obama himself.
stopwords = set(STOPWORDS) | {"President", "Obama", "President Obama", "bo", "ofa"}

# All of Obama's tweets as one space-separated string.
obamaTweets = " ".join(obama.text)

In [None]:
# Create a word cloud image
wc = WordCloud(background_color="white", max_words=1000, stopwords=stopwords)

# Generate a wordcloud
wc.generate(obamaTweets)

# store to file
wc.to_file("images/obamaWordcloud.png")

# show
plt.figure(figsize=[20,10], edgecolor='black')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Library default stopwords plus Clinton's own name and link/handle fragments.
stopwords = {*STOPWORDS, "Hillary", "twitter", "https", "hrc", "io"}

# All of Clinton's tweets as one space-separated string.
clintonTweets = " ".join(clinton.text)

In [None]:
# Load the silhouette image; the context manager closes the file handle once
# the pixels have been copied into an array.
with Image.open("images/clinton.png") as mask_image:
    clinton_mask = np.array(mask_image)

# Transform the mask into one that works with WordCloud: every 0 pixel
# becomes 255, everything else is kept, stored as int32. np.where does this
# in one vectorised step instead of filling an uninitialised array row by
# row in Python.
transformed_clinton_mask = np.where(clinton_mask == 0, 255, clinton_mask).astype(np.int32)

In [None]:
# Create a word cloud image
wc = WordCloud(background_color="white", max_words=1000, stopwords=stopwords, mask=transformed_clinton_mask)

# Generate a wordcloud
wc.generate(clintonTweets)

# store to file
wc.to_file("images/clintonWordcloud.png")

# show
plt.figure(figsize=[20,10], edgecolor='black')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In this Hillary-shaped wordcloud, we can see that she refers to `Donald Trump` a lot in her tweets!

## Sentiment Analysis

In [None]:
# TextBlob is required by the sentiment cells below. If it is not installed
# in the kernel's environment, uncomment and run the next line once:
# !pip install textblob

In [None]:
from textblob import TextBlob

# Snapshot of the Trump frame before the sentiment columns are added.
trump_copy = trump.copy()

# Score each tweet once with TextBlob and collect
# (text, polarity, subjectivity) tuples.
bloblist_desc = list()
df_tweet_descr_str = trump['text'].astype(str)
for row in df_tweet_descr_str:
    blob = TextBlob(row)
    bloblist_desc.append((row, blob.sentiment.polarity, blob.sentiment.subjectivity))

# One row per tweet, aligned to trump's index. This is the frame the
# classification and plotting cells below operate on.
# NOTE(review): the column names are kept as in the original code, so
# 'sentiment' holds TextBlob's polarity score and 'polarity' holds its
# subjectivity score - consider renaming once downstream users are checked.
df_tweet_polarity_desc = pd.DataFrame(
    bloblist_desc,
    columns=['sentence', 'sentiment', 'polarity'],
    index=trump.index,
)

# Per-tweet scores on the main frame. Previously every row received the
# scores of the first tweet only (bloblist_desc[0]), the whole list was
# printed on every iteration, and the result of trump.append(...) was
# thrown away.
trump['sentiment'] = df_tweet_polarity_desc['sentiment']
trump['polarity'] = df_tweet_polarity_desc['polarity']

In [None]:
def f(df_tweet_polarity_desc):
    """Classify one row by the sign of its 'sentiment' value.

    Returns "Positive" for a value above 0, "Neutral" for exactly 0 and
    "Negative" for everything else.
    """
    score = df_tweet_polarity_desc['sentiment']
    if score > 0:
        return "Positive"
    if score == 0:
        return "Neutral"
    return "Negative"


In [None]:
# Label every tweet as Positive / Neutral / Negative from its 'sentiment' score.
df_tweet_polarity_desc['Sentiment_Type'] = df_tweet_polarity_desc.apply(f, axis=1)

# Set the seaborn style before the figure is created so it applies to it.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 10))

# A fixed category order keeps the bars in the same place no matter which
# label happens to appear first in the data.
sns.countplot(
    x="Sentiment_Type",
    data=df_tweet_polarity_desc,
    order=["Positive", "Neutral", "Negative"],
    ax=ax,
)
ax.set_title("Tweets by sentiment type")
ax.set_xlabel("Sentiment type")
ax.set_ylabel("Tweets")
plt.show()