# US Political Tweet Analysis

## Introduction
In this notebook we highlight Data Science and Natural Language Processing methods to analyze tweets pertaining to US Presidential Nominees: Hillary Clinton (Democratic Party) and Donald Trump (Republican Party) before and after the election. Our analysis focuses on sentiment analysis related to each party over time, network effects/flow (echo chamber), and any other election factors.

Before following through this notebook please install all required packages listed in the [references.txt](references.txt) file as well as following the steps listed in the [README](README.md) to download the NLTK corpus.

## Imports

In [11]:
# Python 2/3 compatibility
from __future__ import print_function

import argparse
import json
import string
import re
import os, pwd
import pandas as pd
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from datetime import datetime
from textblob import TextBlob
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
from pprint import pprint

## Load Data

### Helper Functions to preprocess data

In [12]:
# Helper methods which tokenize, and convert the content string
# to a list of words (can also handle #'s, @'s, etc)
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', '...', 'I']
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
    
def extract_http_link(s):
    r = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(r, s)
    if match:
        return match.group()
    return ''

def timestr_to_datetime(timestr):
    time = None
    try:
        time = datetime.strptime(timestr, '%m/%d/%Y %H:%M:%S')
    except ValueError, e:
        print('%s: %s' % (e, timestr))
    return time
    
def geostamp_to_list(geostamp_str):
    list = []
    try:
        if (geostamp_str != ''):
            locations_str = geostamp_str.replace('[', '').split('],')
            lists = [map(float, s.replace(']', '').split(',')) for s in locations_str]
            list = lists
    except ValueError, e:
        print('%s: %s' % (e, geostamp_str))
    return list
    
def tweet_to_list(tweet):
    # Filter out 'RT' text if it's a retweet
    if len(tweet) > 2 and tweet[:2] == 'RT':
        tweet = tweet[3:]
    lst = [term for term in preprocess(tweet) if term not in stop]
    lst = [item.lower() for item in lst]
    return lst

def get_label(sentiment):
    if sentiment > 0:
        return 'positive'
    elif sentiment < 0:
        return 'negative'
    else:
        return 'neutral'

### Create Pandas Dataframe from CSV File and do data preprocessing

In [14]:
username = pwd.getpwuid(os.getuid())[0]
# Get file from Dropbox Directory 
# If you don't have access to our Dropbox then fetch tweets using tweepy and save as a CSV file)
file_name = '/Users/{0:s}/Dropbox/US_UK_ElectionTweets/geo_time_tweets_fixed/temp_geo.csv'.format(username)
file = open(file_name)

df = pd.read_csv(file, dtype={'Geostamp': str})
df['Content'] = df.apply(lambda row: tweet_to_list(row['Content']), axis=1)
df['Geostamp'] = df.apply(lambda row: geostamp_to_list(row['Geostamp']), axis=1)
df['isHillary'] = df.apply(lambda row: bool(row['isHillary']), axis=1)
df['Timestamp'] = df.apply(lambda row: timestr_to_datetime(row['Date'] + ' ' + row['Time']), axis=1)
df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)



## Setup Data

### Create global variables we will use later on

In [15]:
headers = None
corpus = {
    'all': [],
    'hillary': [],
    'trump': [],
    'positive': [],
    'neutral': [],
    'negative': []
}
terms = {
    'all': [],
    'filtered': [],
    'hillary': [],
    'trump': [],
    'positive': [],
    'neutral': [],
    'negative': []
}
terms_all_counter = Counter()
terms_filtered_counter = Counter()
tfidf_matrix = None
geo_data = {
    'type': 'FeatureCollection',
    'features': []
}

### Fill in variables with Twitter Data for use in Analysis

In [16]:
for index, tweet in df.iterrows():
    # Temporary Fix
    tweet['Content'] = [term.lower() for term in tweet['Content']]

    str = ' '.join(tweet['Content'])
    unicode_tweet = unicode(str, errors='replace')
    corpus['all'].append(unicode_tweet)

    if (tweet['isHillary']):
        terms['hillary'].extend(tweet['Content'])
        corpus['hillary'].append(unicode_tweet)
    else:
        terms['trump'].extend(tweet['Content'])
        corpus['trump'].append(unicode_tweet)

    filtered_list = [term for term in tweet['Content'] if not term.startswith(('#', '@'))]
    terms['filtered'].extend(filtered_list)
    terms['all'].extend(tweet['Content'])

    sentiment = get_label(tweet['Compound'])
    terms[sentiment].extend(tweet['Content'])
    corpus[sentiment].append(unicode_tweet)

    if tweet['Geostamp']:
        time = tweet['Timestamp'].strftime('%m/%d/%Y %H:%M:%S').encode('utf-8').strip()
        latlang = tweet['Geostamp'][0]
        latlang[0], latlang[1] = latlang[1], latlang[0]
        coordinates = {'coordinates': latlang, 'type': 'Point'}
        geo_json_feature = {
            'type': 'Feature',
            'geometry': coordinates,
            'properties': {
                'text': unicode_tweet,
                'created_at': time
            }
        }
        geo_data['features'].append(geo_json_feature)

## Analyze Data

### LDA Topic Modeling

In [19]:
texts = [[word for word in document.lower().split()] for document in corpus['all']]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]
dictionary = corpora.Dictionary(texts)
lda_corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(lda_corpus)
corpus_tfidf = tfidf[lda_corpus]
# Initialize an LDA transformation on the data
lda = models.LdaModel(lda_corpus, id2word=dictionary, num_topics=20)
lda.save('tweet_lda_model.lsi')
print(lda.print_topics(2))


[INFO] LDA Model Topics
[(12, u'0.265*"\ufffd" + 0.036*"#notmypresident" + 0.018*"union" + 0.015*"square" + 0.014*"my" + 0.012*"this" + 0.011*"i\'m" + 0.010*"#nevertrump" + 0.010*"not" + 0.009*"#imstillwithher"'), (7, u'0.051*"#notmypresident" + 0.025*"#protest" + 0.020*"trump" + 0.017*"#imwithher" + 0.014*"hillary" + 0.013*"#nevertrump" + 0.013*"#trumpprotest" + 0.013*"hate" + 0.011*"trumps" + 0.011*"watch"')]


### Create TFIDF Matrix

In [20]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
tfidf_matrix =  tf.fit_transform(corpus['all'])
feature_names = tf.get_feature_names()
dense = tfidf_matrix.todense()
dense_tweets = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(dense_tweets)), dense_tweets) if pair[1] > 0]
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:len(sorted_phrase_scores)]:
    print('{0: <40} {1}'.format(phrase, score))

got signatures                           0.214273833351
got signatures number                    0.214273833351
increasing quickly                       0.214273833351
link petition                            0.214273833351
link petition ve                         0.214273833351
notmypresident tweet link                0.214273833351
number increasing                        0.214273833351
number increasing quickly                0.214273833351
petition ve                              0.214273833351
petition ve got                          0.214273833351
signatures number                        0.214273833351
signatures number increasing             0.214273833351
tweet link                               0.214273833351
tweet link petition                      0.214273833351
ve got signatures                        0.214273833351
notmypresident tweet                     0.205145032321
increasing                               0.198668044914
number                                   0.18606

### Cosine Similarity between Tweets

In [21]:
# Helper method which finds cosine similarities given a tfidf matrix and an index of a tweet in matrix
def find_cosine_similar(tfidf_matrix, index, top_n=5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index + 1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]
    
# Select a random tweet to find similar tweets based on cosine similarity
random_tweet = corpus['all'][20]
print('\n[INFO] Tweets Similar To: %s' % (random_tweet))
for index, score in find_cosine_similar(tfidf_matrix, 20):
    print('%.2f  ->  %s' % (score, corpus['all'][index]))


[INFO] Tweets Similar To: fuck @realdonaldtrump don't deserve respect never gave respect #notmypresident fake bastard https://t.co/hddodydbua
0.19  ->  if can't handle @realdonaldtrump don't deserve @barackobama #election2016 #notmypresident
0.17  ->  @cnn @realdonaldtrump never gave respect anyone doesn't deserve respect matter he's racist thief asshole #notmypresident
0.17  ->  if dealing troll trump supporters block do engage don't deserve respect #trolls #notmypresident
0.15  ->  @egg509 respect never #notmypresident
0.10  ->  stop telling respect trump man done nothing say things made lose respect #notmypresident


### Dump Geodata into JSON File to visualize
To visualize the data follow the steps listed in the [README](README.md)

In [None]:
with open('geo_data.json', 'w') as fout:
    print('\n[INFO] Dumped geo data into geo_data.json')
    fout.write(json.dumps(self.geo_data, indent=4))

### Create Counters to see most popular terms

In [None]:
# Print out 10 most frequent words filtered
terms_filtered_counter = Counter(terms['filtered'])
for word, count in terms_filtered_counter.most_common(15):
    print('{0}: {1}'.format(word, count))

# Print out 10 most unfiltered frequent words
terms_all_counter = Counter(terms['all'])
for word, count in terms_all_counter.most_common(15):
    print('{0}: {1}'.format(word, count))