# Classification of Pro-Trump or Pro-Hillary Tweet


In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.externals import joblib

import pickle

## Topics Feature
topics taken from https://github.com/WiMLDS/election-data-hackathon/blob/master/candidate-tweets-oct-2016/data/ReportTwitterAnalysisofPresidentialCandidates.pdf

Vectorized feature: 1 if a word from the topic is in the tweet, else 0.

In [2]:
# given a topic for each tweet if a word in topic X exists in the tweet then the feature corresponding is 1, else 0
def topicFeaturesExtract(tweet):
    tweetWords = set(tweet.split())
    topics = ['email', 'russia', 'race', 'immigration', 'trust', 'sex', 'female', 'male']
    output = [0] * len(topics)
    
    wordLists = {}
    wordLists['email']  = ['emails', 'email', 'crookedhillary', 'prison', 'crooked']
    wordLists['russia'] =  ['putin', 'russia', 'crimea', 'vladimir', 'ukraine', 'russian']
    wordLists['race']   = ['white', 'black', 'racist', 'race']
    wordLists['immigration'] = ['borders', 'border', 'wall', 'mexico', 'illegal', 'immigrants', 'immigration', 
                               'trafficking']
    wordLists['trust'] = ['factcheck', 'fact', 'factchecking', 'bigleaguetruth', 'trust', 'politifact', 
                          'trustworthiness', 'lies', 'lie', 'truth', 'liar']
    wordLists['sex'] = ['sex', 'sexual', 'assault', 'rape', 'rapist', 'transgression', 'transgressions']
    
    wordLists['female'] = ['she', 'her', 'she\'s', 'herself']
    wordLists['male'] = ['he', 'his', 'he\'s', 'himself', 'him']
    
    for idx, val in enumerate(topics):
        if len(tweetWords.intersection(wordLists[val])) > 0:
            output[idx] = 1
        else:
            output[idx] = 0     
    
    return output

# Given a list of tweets return list of lists each sublist is a topic
def topicFeatures(tweetList):
    totalFeatures = [[], [], [], [], [], [], [], []]
    for tweet in tweetList:
        feats = topicFeaturesExtract(tweet)
       
        for idx, val in enumerate(feats):
            totalFeatures[idx].append(val)
        
    return totalFeatures

## Name Occurrence Feature
Three features:
1. If the words (trump, pence, donald) exists then 1 else 0
2. If the words (hillary, kaine, clinton, tim) exists then 1 else 0
3. If (bill) exists then 1 else 0

In [3]:
def trump_occurence(tweet):
    if (('trump' in nltk.word_tokenize(tweet.lower())) or 
        ('pence' in nltk.word_tokenize(tweet.lower())) or ('donald' in nltk.word_tokenize(tweet.lower()))):
        return 1
    else:
        return 0
    
def hillary_occurence(tweet):
    if (('hillary' in nltk.word_tokenize(tweet.lower())) 
        or ('kaine' in nltk.word_tokenize(tweet.lower())) 
        or ('clinton' in nltk.word_tokenize(tweet.lower())) 
        or ('tim' in nltk.word_tokenize(tweet.lower()))):
        return 1
    else:
        return 0

def bill_occurence(tweet):
    if 'bill' in nltk.word_tokenize(tweet.lower()):
        return 1
    else:
        return 0

## Distance Feature
These are four features: two for the Hillary camp (hillary, clinton, kaine, tim) and two for the Trump camp (trump, donald, pence, mike). Given a list of negative words, the first two features compute the minimum distance between any name in the camp and a negative word, if one exists in the tweet. If no negative word exists, or if no name from the camp exists, the feature is 20. The value 20 was selected because it is highly unlikely that a tweet will have 20 words. The third and fourth features are the same but use a list of positive words.

In [4]:
# Generate a negative word list and positive word list using the entire data set. 
# Take every unigram in the tweets and apply vader on each  unigram, 
# the word list is all unigrams with a negative vader score = 1

df1 = pd.DataFrame.from_csv('Improved Hillary Tweets.csv')
df2 = pd.DataFrame.from_csv('Improved Trump Tweets.csv')

sid = SentimentIntensityAnalyzer()

k1 = [tweet.split() for tweet in df1['clean_tweets']]
k1 = [item for sublist in k1 for item in sublist]
k1 = [(word.lower(), sid.polarity_scores(word)['neg'], sid.polarity_scores(word)['pos']) for word in k1]

k2 = [tweet.split() for tweet in df2['clean_tweets']]
k2 = [item for sublist in k2 for item in sublist]
k2 = [(word.lower(), sid.polarity_scores(word)['neg'],sid.polarity_scores(word)['pos']) for word in k2]

k = set(k1 + k2)

neg_word_list = sorted(k, key=lambda x: x[1], reverse=True)
neg_word_list = [t[0] for t in k if t[1] == 1.0] 

# add the words not hasnt cant wont didnt in the list
neg_word_list.append('not')
neg_word_list.append("hasnt")
neg_word_list.append("cant")
neg_word_list.append("wont")
neg_word_list.append("didnt")
neg_word_list.append("doesnt")
neg_word_list.append("n\'t")

# generate a positive word list as well
pos_word_list = sorted(k, key=lambda x: x[2], reverse=True)
pos_word_list = [t[0] for t in k if t[2] == 1.0] 

In [4]:
neg_word_list = pickle.load(open("neg_word_list.pkl","rb"))
pos_word_list = pickle.load(open("pos_word_list.pkl","rb"))

In [32]:
def dist_hillary(tweet, input_wordList):
    wordlist = nltk.word_tokenize(tweet.lower())
    distances = []
    
    index = []
    for name in ['hillary','kaine','tim','clinton']:
        try:
            index.append(wordlist.index(name))
        except:
            a=1
    
    if len(index) == 0:
        return 20
    
    for item in wordlist:
        if item in input_wordList:
            negative_index = wordlist.index(item)
            for idx in index:
                distances.append(abs(negative_index - idx))
    if len(distances)!=0:
        return min(distances)
    else:
        return 20
            
def dist_trump(tweet, input_wordList):
    wordlist = nltk.word_tokenize(tweet.lower())
    distances = []
    index = []
    for name in ['donald','trump','mike','pence']:
        try:
            index.append(wordlist.index(name))
        except:
            a=1
    
    if len(index) == 0:
        return 20
    for item in wordlist:
        if item in input_wordList:
            negative_index = wordlist.index(item)
            for idx in index:
                distances.append(abs(negative_index - idx))
    if len(distances)!=0:
        return min(distances)
    else:
        return 20

## n-gram sentiment feature
Pick the n-gram around each important name, then output the sum of the VADER compound scores of those n-grams.

In [6]:
def tweet_processing(tweet_list):
    processed_list = []
    for item in tweet_list:
        item = item.lower().replace('hillary clinton', 'clinton')
        item = item.lower().replace('hrc', 'clinton')
        item = item.lower().replace('bill clinton', 'bill')
        item = item.lower().replace('donald trump', 'trump')
        item = item.lower().replace('tim kaine', 'kaine')
        item = item.lower().replace('mike pence', 'pence')
        item = item.lower().replace('@hillaryclinton', 'clinton')
        item = item.lower().replace('@timkaine', 'kaine')
        item = item.lower().replace('@realDonaldTrump', 'trump')
        item = item.lower().replace('@mike_pence', 'pence')
        processed_list.append(item)
    return processed_list

In [31]:
def tokenize_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences
    
    output = []
    for sent in raw_sents:
        output = output + nltk.word_tokenize(sent)
        
    return output

def create_sentences(data_frame):
    mylist = []
    for item in data_frame:
        mylist.append(tokenize_text(item))
    
    return mylist

def ngram_features_sum(tweet, n=4):
    hillary_noun_list = ['hillary','clinton','kaine', 'tim']
    trump_noun_list   = ['trump', 'donald', 'mike', 'pence']
    features = [[],[]]
    score_list_hillary = 0
    score_list_trump = 0
    sid = SentimentIntensityAnalyzer()
#     result_set    = create_sentences(tweet)
    sentence_list = create_sentences(tweet)
#     sentence_list = flatten_sentences(result_set)
#     print(len(sentence_list))
    for tokens in sentence_list:
#         print(tokens)
        sum_trump = []
        sum_hillary = []
        for item in tokens:
            index = tokens.index(item)
            if index-n < 0:
                extraEnd = n-index
            else:
                extraEnd = 0

            if index+n > (len(tokens)-1):
                extraStart = index+n - (len(tokens)-1)
            else:
                extraStart = 0

            start = max(0, index-n-extraStart)
            end   = min(len(tokens)-1, index+n+extraEnd) 
            n_gram = tokens[start:end+1]
            flat_tweet = ' '.join(n_gram)
            ss = sid.polarity_scores(flat_tweet)
            if item in trump_noun_list:
                sum_trump.append(ss['compound'])
            if item in hillary_noun_list:
                sum_hillary.append(ss['compound'])
                
        if(len(sum_trump) == 0):
            score_list_trump = 0
        else:
            score_list_trump = sum(sum_trump)
        
        if(len(sum_hillary) == 0):
            score_list_hillary = 0
        else:
            score_list_hillary = sum(sum_hillary)
            
        #score_list_trump = sum(sum_trump) / float(len(sum_trump))
        #score_list_hillary = sum(sum_hillary) / float(len(sum_hillary))
        features[0].append(score_list_trump)
        features[1].append(score_list_hillary)
        
    return features

# Generate Features
Use the above functions to generate the feature set. The input csv contains two columns:
1. preprocessed tweet
2. our tag as to whether the tweet is pro-trump or pro-hillary (only for the training set)

In [8]:
# df is a pandas dataframe with Tweet column
def extractFeatures(df):
    df['Tweet'] = tweet_processing(df['Tweet'].tolist())
    
    # distance feature
    trump_distance_negative_word   = []
    hillary_distance_negative_word = []
    trump_distance_positive_word   = []
    hillary_distance_positive_word = []
    trump_occured                  = []
    hillary_occured                = []
    bill_occured                   = []

    for tweet in df['Tweet']:
        trump_distance_negative_word.append(dist_trump(tweet, neg_word_list))
        hillary_distance_negative_word.append(dist_hillary(tweet, neg_word_list))
        trump_distance_positive_word.append(dist_trump(tweet, pos_word_list))
        hillary_distance_positive_word.append(dist_hillary(tweet, pos_word_list))
        trump_occured.append(trump_occurence(tweet))
        hillary_occured.append(hillary_occurence(tweet))
        bill_occured.append(bill_occurence(tweet))
    
    # topic and adjective features
    topicFeats = topicFeatures(df['Tweet'])
    
    # ngram vader features
    value  = ngram_features_sum(df['Tweet'])

    # compound vader score for entire tweet
    sid    = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(tweet)['compound'] for tweet in df['Tweet']]
    
    # combine the features into an output
    allFeatures = pd.DataFrame()

    allFeatures['trump_distance_negative_word']   = trump_distance_negative_word
    allFeatures['hillary_distance_negative_word'] = hillary_distance_negative_word
    allFeatures['trump_distance_positive_word']   = trump_distance_positive_word
    allFeatures['hillary_distance_positive_word'] = hillary_distance_positive_word
    allFeatures['trump_occured']                  = trump_occured
    allFeatures['hillary_occured']                = hillary_occured
    allFeatures['bill_occured']                   = bill_occured

    topics = ['email', 'russia', 'race', 'immigration', 'trust', 'sex', 'female', 'male']
    for j in range(0,8):
        allFeatures[topics[j]] = topicFeats[j]
    
    allFeatures['trump_ngram_vader']   = value[0]
    allFeatures['hillary_ngram_vader'] = value[1]
    
    allFeatures['Score'] = scores
    
    return allFeatures

In [119]:
# x = pd.DataFrame.from_csv("tagged_tweets_harman - Sheet1.csv")
y = pd.DataFrame.from_csv("pro_trump - Sheet1.csv")
z = pd.DataFrame.from_csv("Final_csv.csv")
x = pd.DataFrame.from_csv("allTaggedCANY.csv")
w = pd.DataFrame.from_csv("protrumptweets.csv")

In [120]:
# some tweets are neither pro trump nor pro hillary so there is no tag and hence drop them
x.dropna(axis=0, inplace=True)
x.index = range(0,len(x))

y.dropna(axis=0, inplace=True)
y.index = range(0,len(y))

z.dropna(axis=0, inplace=True)
z.index = range(0,len(z))

w.dropna(axis=0, inplace=True)
w.index = range(0,len(w))

tot = pd.concat([w,x,y,z])
tot = tot.drop_duplicates(subset='Tweet')
tot.index = range(0,len(tot))

In [165]:
# Build the feature table for the merged tagged tweets
# (extractFeatures also rewrites tot['Tweet'] with the normalized text)
allFeatures           = extractFeatures(tot)

# Attach the label column; the assignment aligns tot's index with allFeatures' index.
# Then save the table to train.csv.
allFeatures['Winner'] = tot['Winner']
allFeatures.to_csv('train.csv')

In [None]:
allFeatures = pd.DataFrame.from_csv('train.csv')

In [166]:
# take equal number of trump and hillary tweets
# allFeatures = allFeatures[allFeatures['Score'] != 0]
print(len(allFeatures))
print(sum(allFeatures['Winner']))

numTrump      = int(sum(tot['Winner']))
TrumpTweets   = allFeatures[allFeatures['Winner'] == 1]
HillaryTweets = allFeatures[allFeatures['Winner'] == 0]
HillaryTweets = HillaryTweets.sample(numTrump)

HillaryTweets.index = range(0, numTrump)

allFeats = pd.concat([TrumpTweets, HillaryTweets])
allFeats = allFeats.sample(frac=1).reset_index(drop=True)

811
296.0


# Machine Learning

In [167]:
# read in the training data (which is just allFeatures from above but doing this for modularity)

# data    = pd.DataFrame.from_csv('train.csv')
data    = allFeats
datanew = data
datanew = datanew.drop('Winner',1)

trainData, devData, trainTarget, devTarget = train_test_split(datanew, data['Winner'], train_size=0.8)

Use a MinMaxScaler to normalize the features and use an MLP neural net as the learner.

In [168]:
# Model: min-max feature scaling followed by a bagging ensemble of 5 MLP classifiers,
# each with one hidden layer of 100 units, trained with adam for at most 500 iterations.
# NOTE(review): newer scikit-learn releases rename BaggingClassifier's `base_estimator`
# argument to `estimator` -- confirm against the installed version.
scaler    = MinMaxScaler()
learner   = MLPClassifier(solver='adam', hidden_layer_sizes=(100,), max_iter=500)
voter     = BaggingClassifier(base_estimator=learner, n_estimators=5)
pipeline  = Pipeline([('scaler', scaler), ('learner', voter)])

In [171]:
# Fit the scaler and the bagged MLP ensemble on the training split
pipeline.fit(trainData,trainTarget)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('learner', BaggingClassifier(base_estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='cons..._estimators=5, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False))])

In [172]:
# Score the fitted pipeline on the held-out dev split
pipeline.score(devData, devTarget)

0.76470588235294112

In [174]:
# Persist the fitted pipeline (scaler + ensemble) to disk for later reuse
joblib.dump(pipeline,"HONESTCLASSIFIER.pkl")

['HONESTCLASSIFIER.pkl']

In [17]:
pipeline.predict(devData)

final = pd.DataFrame()
final['predicted'] = pipeline.predict(devData)
final.index = devTarget.index
final['true']      = devTarget
final['tweet']     = x['Tweet'][devTarget.index]

Run the classifier on the other states to get electoral map results.

In [36]:
# Two-letter abbreviation -> full name. Besides the 50 states and DC this holds five
# territories (AS, GU, MP, PR, VI) and a 'NA' ("National") entry; the code that uses
# this table removes those six keys before working per state.
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [37]:
# generate total tweets that has state information
h = pd.DataFrame.from_csv("Improved Hillary Tweets.csv")
t = pd.DataFrame.from_csv("Improved Trump Tweets.csv")

tweets = h['clean_tweets'].tolist() + t['clean_tweets'].tolist()
locs   = h['state_abbs'].tolist() + t['state_abbs'].tolist()

totalFrame = pd.DataFrame()
totalFrame['Tweet'] = tweets
totalFrame['loc']   = locs

In [38]:
truelocs = list(states.keys())
for extra in ['AS', 'GU', 'MP', 'NA', 'VI', 'PR']:
    truelocs.remove(extra)

totalFrame = totalFrame[totalFrame['loc'].isin(truelocs)]
totalFrame.sort_values(by='loc', inplace=True)
totalFrame.index = range(0,len(totalFrame))

In [39]:
# Feature table for every state-tagged tweet (this also normalizes totalFrame['Tweet'] in place)
totalFrameFeats = extractFeatures(totalFrame)

In [86]:
def getStatePredictions(df):
    locs = list(states.keys())
    for extra in ['AS', 'GU', 'MP', 'NA', 'VI', 'PR']:
        locs.remove(extra)
    
    locs.sort()
    result = []

    for j in range(0,51):
        tmp = sum(df[df['loc'] == locs[j]]['mlanswer'])
        
        if tmp > len(df[df['loc'] == locs[j]]['mlanswer'])/2:
            result.append("Trump")
        else:
            result.append("Hillary")
            
    # write out the predicted results
    resultsDf = pd.DataFrame()
    
    # write also real results and sentiment results
    realResult = pd.DataFrame.from_csv("electionresult.csv")
    sentiresl  = pd.DataFrame.from_csv("predictedResults.csv")
    stateAvgs  = pd.DataFrame.from_csv("stateAvgs.csv")

    resultsDf["state"]           = locs
    
    resultsDf['Hillary Tweets']  = stateAvgs['H Num']
    resultsDf['Trump Tweets']    = stateAvgs['T Num']
    resultsDf['Sentiment Result'] = sentiresl["Prediction"]

    resultsDf["ML Prediction"]   = result
    resultsDf['Real Result']     = realResult["Winner"].values
    
    resultsDf.to_csv("predictedResultsML.csv")
   
    # states in which tweets where taken for training the neural nets
    trainingStates = ['CA', 'NY', 'KY']
    elimStates     = trainingStates
    for s in resultsDf["state"]:
        if (resultsDf[resultsDf["state"]==s]["Hillary Tweets"].tolist()[0] + 
            resultsDf[resultsDf["state"]==s]["Trump Tweets"].tolist()[0]) < 150:
            elimStates.append(s)
    
    resultsDf = resultsDf[resultsDf['state'].isin(elimStates) == False]
    numStates = len(resultsDf)
    
    resultsDf.index = range(0,numStates)
    # print an accuracy score
    print("Accuracy Score: " + str(sum([ x==y for (x,y) in 
                                    zip(resultsDf["ML Prediction"].values, 
                                        resultsDf["Real Result"].values.tolist())])))
    
    return resultsDf

In [89]:
# pipeline.fit(datanew, data['Winner'])

# Predict a label for every state-tagged tweet from its feature row
result = pipeline.predict(totalFrameFeats)

# Store the predictions next to the tweets; getStatePredictions reads this 'mlanswer' column
totalFrame['mlanswer'] = result

resultsDf = getStatePredictions(totalFrame)

# Bare expression so the notebook displays the table
resultsDf

NameError: name 'totalFrameFeats' is not defined

In [120]:
# Save the per-state result table returned by getStatePredictions
resultsDf.to_csv("goodResult.csv")

In [29]:

# Reload the saved pipeline from disk.
# NOTE: joblib.load unpickles the file, so only load files you trust.
pipeline = joblib.load("HONESTCLASSIFIER.pkl")


In [173]:
hillary = pd.DataFrame.from_csv("HillaryWithSentiments.csv")
trump = pd.DataFrame.from_csv("TrumpWithSentiments.csv")
temp = pd.DataFrame()

x = 43

# temp["Tweet"] = hillary[hillary.index.isin([x])]["clean_tweets"]
temp["Tweet"] = trump[trump.index.isin([x])]["clean_tweets"]

print(temp["Tweet"])
readableOutput = {1: 'Trump', 0: 'Hillary'}

print(readableOutput[pipeline.predict(extractFeatures(temp))[0]])

43    Can't wait for Hillary supporters who have a p...
Name: Tweet, dtype: object
Hillary


In [55]:
# ENTER YOUR TWEET HERE
yourTweet = ""

# Wrap the single tweet in a one-row frame, the input shape extractFeatures expects
temp = pd.DataFrame()
temp["Tweet"] = [yourTweet]

# Map the classifier's numeric label to a candidate name
readableOutput = {1: 'Trump', 0: 'Hillary'}

print(readableOutput[pipeline.predict(extractFeatures(temp))[0]])

Trump


In [50]:
# Display the feature row computed for the tweet currently held in temp
extractFeatures(temp)

Unnamed: 0,trump_distance_negative_word,hillary_distance_negative_word,trump_distance_positive_word,hillary_distance_positive_word,trump_occured,hillary_occured,bill_occured,email,russia,race,immigration,trust,sex,female,male,trump_ngram_vader,hillary_ngram_vader,Score
0,20,20,20,7,0,1,0,0,0,0,0,1,0,1,0,0,-0.4023,-0.4023
