In [1]:
import pandas as pd

df1 = pd.read_csv('post_election_predicted.csv')

In [2]:
df_pos = df1[df1['predictor']=='pos'].reset_index()
df_neg = df1[df1['predictor']=='neg'].reset_index()
df_gen = df1[df1['predictor']=='gen'].reset_index()

pos_string = df_pos['text']
neg_string = df_neg['text']
gen_string = df_gen['text']

import string
import re

## Importing stopwords 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
## Importing porter stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

## Importing the tokenizer 
from nltk.tokenize import TweetTokenizer

def clean_tweets(tweet):
    tweet = re.sub(r'\$\w*', '', tweet)
    
    ## Cleaing up RTs
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    ## Removing hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    ## Removing hastags
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
                word not in string.punctuation): # remove punctuation
            ##stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(word)
 
    return tweets_clean

def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary
 
# positive tweets feature set
pos_tweets_set = []
for tweet in pos_string:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    
    
## negative tweets
neg_tweets_set = []
for tweet in neg_string:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))
    
## general tweets

gen_tweets_set = []
for tweet in gen_string:
    gen_tweets_set.append((bag_of_words(tweet), 'gen'))
    
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
 
test_set = pos_tweets_set[:40] + neg_tweets_set[:40] + gen_tweets_set[:40]
train_set = pos_tweets_set[10:] + neg_tweets_set[10:] + gen_tweets_set[10:]
print(len(train_set),len(test_set))

from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) # Output: 0.765
 
print (classifier.show_most_informative_features(10))

candidates_full_name = ['Joe Biden', 'Amy Klobuchar', 'Pete Buttigieg', 'Bloomberg', 'Bernie Sanders', 'Elizabeth Warren']
candidates_last_name = ['Biden', 'Klobuchar', 'Buttigieg', 'Bloomberg', 'Sanders', 'Warren']

def get_sentiment(name):
    
    final_results = []
    
    for potential_pres in name:
        custom_tweet_set = bag_of_words(potential_pres)
        prob_result = classifier.prob_classify(custom_tweet_set)
        name = potential_pres
        results = str(prob_result.max()) 
        neg_results = str(prob_result.prob("neg"))
        pos_results = str(prob_result.prob("pos"))
        create_dict = {'Name': name, 'Results': results, 'Negative Probability': neg_results, 'Positive Probability': pos_results}
        final_results.append(create_dict)
    
    return final_results

14268 120
0.8083333333333333
Most Informative Features
                     que = True              gen : neg    =    202.0 : 1.0
                      la = True              gen : neg    =    198.9 : 1.0
                      de = True              gen : neg    =    193.0 : 1.0
                     los = True              gen : neg    =    157.1 : 1.0
                      un = True              gen : neg    =    149.2 : 1.0
                     con = True              gen : neg    =    104.3 : 1.0
                      se = True              gen : neg    =    100.6 : 1.0
               uncertain = True              gen : pos    =     69.4 : 1.0
                  switch = True              gen : pos    =     63.9 : 1.0
                      du = True              pos : neg    =     62.2 : 1.0
None


In [6]:
post_elect_sent = get_sentiment(candidates_last_name)

In [9]:
post_elec_df = pd.DataFrame(post_elect_sent)

post_elec_df['scenario'] = 'post_election'

In [10]:
post_elec_df.to_csv('post_election_candidate_sentiment.csv')