In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Step 1: Use Vader to generate the sentiment scores

In [2]:
# assume dataframe has a field called "clean_tweets" and write it out to file
def getSentimentScores(df, filename):
    """Score every tweet with VADER and write the augmented frame to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``clean_tweets`` column holding the tweet text.
        The frame is modified in place: the columns ``positive``,
        ``negative``, ``neutral`` and ``compound`` are added.
    filename : str
        Destination path handed to ``DataFrame.to_csv``.
    """
    sid = SentimentIntensityAnalyzer()

    # Score each tweet exactly once; every call returns all four values, so
    # re-running the analyzer per output column would quadruple the work.
    scores = [sid.polarity_scores(tweet) for tweet in df.clean_tweets]

    # output column name -> key in the dict returned by polarity_scores
    for column, key in (("positive", "pos"),
                        ("negative", "neg"),
                        ("neutral", "neu"),
                        ("compound", "compound")):
        df[column] = [score[key] for score in scores]

    df.to_csv(filename)

In [3]:
# The preprocessing steps produce "Improved Trump Tweets.csv" and "Improved Hillary Tweets.csv".
# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True is the documented drop-in replacement.

getSentimentScores(pd.read_csv("Improved Hillary Tweets.csv", index_col=0, parse_dates=True),
                   "HillaryWithSentiments.csv")
getSentimentScores(pd.read_csv("Improved Trump Tweets.csv", index_col=0, parse_dates=True),
                   "TrumpWithSentiments.csv")

# Step 2: Location based sentiment scores

Using the sentiment scores from step 1, the first analysis groups them by state.

In [4]:
# US states and territories taken from:
# http://code.activestate.com/recipes/577305-python-dictionary-of-us-states-and-territories/

# Two-letter abbreviation -> full name. Besides the 50 states and DC the table
# also carries the territories and an 'NA' ("National") entry.
states = dict([
    ("AK", "Alaska"), ("AL", "Alabama"), ("AR", "Arkansas"),
    ("AS", "American Samoa"), ("AZ", "Arizona"), ("CA", "California"),
    ("CO", "Colorado"), ("CT", "Connecticut"), ("DC", "District of Columbia"),
    ("DE", "Delaware"), ("FL", "Florida"), ("GA", "Georgia"),
    ("GU", "Guam"), ("HI", "Hawaii"), ("IA", "Iowa"),
    ("ID", "Idaho"), ("IL", "Illinois"), ("IN", "Indiana"),
    ("KS", "Kansas"), ("KY", "Kentucky"), ("LA", "Louisiana"),
    ("MA", "Massachusetts"), ("MD", "Maryland"), ("ME", "Maine"),
    ("MI", "Michigan"), ("MN", "Minnesota"), ("MO", "Missouri"),
    ("MP", "Northern Mariana Islands"), ("MS", "Mississippi"), ("MT", "Montana"),
    ("NA", "National"), ("NC", "North Carolina"), ("ND", "North Dakota"),
    ("NE", "Nebraska"), ("NH", "New Hampshire"), ("NJ", "New Jersey"),
    ("NM", "New Mexico"), ("NV", "Nevada"), ("NY", "New York"),
    ("OH", "Ohio"), ("OK", "Oklahoma"), ("OR", "Oregon"),
    ("PA", "Pennsylvania"), ("PR", "Puerto Rico"), ("RI", "Rhode Island"),
    ("SC", "South Carolina"), ("SD", "South Dakota"), ("TN", "Tennessee"),
    ("TX", "Texas"), ("UT", "Utah"), ("VA", "Virginia"),
    ("VI", "Virgin Islands"), ("VT", "Vermont"), ("WA", "Washington"),
    ("WI", "Wisconsin"), ("WV", "West Virginia"), ("WY", "Wyoming"),
])

In [15]:
# Instead of looking at, for example, the positive sentiment score averaged over all tweets,
# we are interested in the positive sentiment score averaged over only those tweets whose score is above some threshold.
# In particular, if the threshold is 0, we look only at tweets that have at least some positive scoring.

def myMean(array, threshold=0.2):
    """Average of the entries of ``array`` that are strictly above ``threshold``.

    Parameters
    ----------
    array : array-like of numbers
        A pandas Series, numpy array or plain sequence.
    threshold : float, default 0.2
        Entries less than or equal to this value are ignored.

    Returns
    -------
    The mean of the retained entries, or 0 when nothing exceeds the threshold
    (including when ``array`` is empty).
    """
    # Convert once so plain lists work too, and build the filter a single time
    # instead of re-evaluating the boolean mask for the test, the sum and the count.
    values = np.asarray(array, dtype=float)
    selected = values[values > threshold]
    if selected.size == 0:
        return 0
    return selected.mean()

def myCount(array, threshold=0):
    """Return how many entries of ``array`` are strictly greater than ``threshold``."""
    aboveThreshold = array > threshold
    kept = array[aboveThreshold]
    return len(kept)

def _candidateStateStats(tweets, state):
    """Return the seven per-state statistics for one candidate's scored tweets."""
    # Filter once and reuse the slice for every statistic.
    inState = tweets.loc[tweets['state_abbs'] == state]
    return [len(inState),
            # Threshold -2 keeps every tweet, assuming VADER compound scores are
            # normalized to [-1, 1], so this is the plain average compound score.
            myMean(inState['compound'], -2),
            myMean(inState['positive']),
            myMean(inState['neutral']),
            myMean(inState['negative']),
            myCount(inState['positive']),
            myCount(inState['negative'])]


def getStateStatistics():
    """Aggregate per-state sentiment statistics and write them to 'stateAvgs.csv'.

    Reads "HillaryWithSentiments.csv" and "TrumpWithSentiments.csv", computes for
    every abbreviation in ``states`` the number of tweets, the thresholded average
    sentiment scores and the number of tweets with a nonzero positive or negative
    score, drops the territories (DC is kept) and writes the table sorted by state.
    """
    # pd.DataFrame.from_csv was removed in pandas 1.0; the first CSV column is
    # the saved index, which is not used below.
    hillary = pd.read_csv("HillaryWithSentiments.csv", index_col=0)
    trump   = pd.read_csv("TrumpWithSentiments.csv", index_col=0)

    # One row per state: Hillary's seven statistics followed by Trump's.
    averageScores = {state: _candidateStateStats(hillary, state) + _candidateStateStats(trump, state)
                     for state in states}

    # generate a dataframe and do some ordering and filtering on the averages
    stateAvgs = pd.DataFrame.from_dict(averageScores, orient='index')
    stateAvgs.insert(0, 'state', stateAvgs.index)

    stateAvgs.columns = ['state', 'H Num', 'Avg Comp H', 'Avg Pos H', 'Avg Neu H', 'Avg Neg H',
                         'Num Pos H', 'Num Neg H', 'T Num', 'Avg Comp T', 'Avg Pos T', 'Avg Neu T',
                         'Avg Neg T', 'Num Pos T', 'Num Neg T']

    # eliminate US territories; we keep DC
    territories = ['AS', 'GU', 'MP', 'NA', 'VI', 'PR']
    stateAvgs = stateAvgs[~stateAvgs['state'].isin(territories)]

    # Renumber rows 0..n-1 without hard-coding the expected number of rows.
    stateAvgs = stateAvgs.sort_values(by='state').reset_index(drop=True)
    stateAvgs.to_csv('stateAvgs.csv')

In [65]:
# Build the per-state sentiment summary; the function writes it to 'stateAvgs.csv'.
getStateStatistics()

In [2]:
# pd.DataFrame.from_csv was removed in pandas 1.0; the first CSV column is the
# integer row index written by getStateStatistics.
stateAvgs = pd.read_csv("stateAvgs.csv", index_col=0)
stateAvgs

Unnamed: 0,state,H Num,Avg Comp H,Avg Pos H,Avg Neu H,Avg Neg H,Num Pos H,Num Neg H,T Num,Avg Comp T,Avg Pos T,Avg Neu T,Avg Neg T,Num Pos T,Num Neg T
0,AK,11,-0.1258,0.2555,0.764,0.3344,3,8,8,0.17205,0.333,0.871,0.211,5,1
1,AL,88,-0.097335,0.295571,0.774148,0.288579,53,48,100,0.000149,0.34795,0.77868,0.31185,57,51
2,AR,72,0.012082,0.231,0.865028,0.3014,33,22,32,0.141634,0.282357,0.761469,0.0,20,22
3,AZ,113,-0.214465,0.29275,0.745637,0.311837,51,72,136,-0.06916,0.363524,0.790567,0.319395,54,74
4,CA,770,-0.046683,0.28178,0.811157,0.330774,328,352,894,-0.013163,0.326318,0.791936,0.31649,436,428
5,CO,144,-0.038433,0.246432,0.772104,0.287762,79,83,111,-0.006689,0.28885,0.797297,0.29092,55,54
6,CT,83,-0.024806,0.2595,0.829337,0.272545,43,40,63,-0.062267,0.293091,0.797841,0.2548,29,45
7,DC,204,0.00309,0.267588,0.826147,0.271,89,86,193,-0.007446,0.29845,0.839249,0.297304,81,92
8,DE,16,0.049681,0.233,0.842625,0.0,7,9,11,0.072709,0.88,0.740429,0.28,7,5
9,FL,622,-0.102388,0.351,0.802103,0.308771,250,349,510,-0.072037,0.3509,0.79506,0.295839,246,284


# Step 3: "Prediction" Results

Try using the average state scores to see if we can predict which candidate wins which state.

In [67]:
def getStatePredictions():
    """Predict each state's winner from 'stateAvgs.csv' and report the accuracy.

    Rule: a candidate earns one point for the higher average positive score and
    one point for the lower average negative score. The candidate with more
    points wins the state; on a 1-1 split the higher average compound score wins.

    Writes the columns ``state``, ``Prediction`` and ``Real Result`` to
    "predictedResults.csv" and prints the fraction of correct predictions.
    """
    # pd.DataFrame.from_csv was removed in pandas 1.0. Resetting the index after
    # sorting keeps row positions and row labels aligned for the steps below.
    sentimentScores = (pd.read_csv('stateAvgs.csv', index_col=0)
                       .sort_values(by='state', ascending=True)
                       .reset_index(drop=True))

    # NOTE(review): both comparisons are strict, so when a metric is exactly tied
    # the point goes to Trump, as in the original loop. Confirm this is intended.
    hillaryPoints = ((sentimentScores['Avg Pos H'] > sentimentScores['Avg Pos T']).astype(int)
                     + (sentimentScores['Avg Neg H'] < sentimentScores['Avg Neg T']).astype(int))

    # Each metric awards exactly one point, so Trump holds 2 - hillaryPoints:
    # 2 means Hillary wins outright, 0 means Trump does, 1 is a split.
    compoundFavorsHillary = sentimentScores['Avg Comp H'] > sentimentScores['Avg Comp T']
    hillaryWins = (hillaryPoints == 2) | ((hillaryPoints == 1) & compoundFavorsHillary)
    result = np.where(hillaryWins, "Hillary", "Trump").tolist()

    # Real results are matched to the predictions by row position.
    # Assumes electionresult.csv lists the same states in the same sorted order; TODO confirm.
    realResult = pd.read_csv("electionresult.csv", index_col=0)
    realWinners = realResult["Winner"].values

    # write out the predicted results next to the real ones
    resultsDf = pd.DataFrame({"state": sentimentScores["state"],
                              "Prediction": result,
                              "Real Result": realWinners})
    resultsDf.to_csv("predictedResults.csv")

    # print an accuracy score, dividing by the actual number of rows
    matches = sum(x == y for (x, y) in zip(result, realWinners.tolist()))
    accuracy = matches / len(result) if result else 0.0
    print("Accuracy Score: " + str(accuracy))

In [69]:
# pd.DataFrame.from_csv was removed in pandas 1.0; the first CSV column is the
# row index written by getStatePredictions.
df = pd.read_csv("predictedResults.csv", index_col=0)
df

Unnamed: 0,state,Prediction,Real Result
0,AK,Trump,Trump
1,AL,Trump,Trump
2,AR,Trump,Trump
3,AZ,Trump,Trump
4,CA,Trump,Hillary
5,CO,Trump,Hillary
6,CT,Trump,Hillary
7,DC,Hillary,Hillary
8,DE,Trump,Hillary
9,FL,Trump,Trump
