In [2]:
import pandas as pd

df1 = pd.read_csv('post_debate_file.csv')

In [3]:
df_pos = df1[df1['predictor']=='pos'].reset_index()
df_neg = df1[df1['predictor']=='neg'].reset_index()
df_gen = df1[df1['predictor']=='gen'].reset_index()

pos_string = df_pos['text']
neg_string = df_neg['text']
gen_string = df_gen['text']

In [4]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

for p_tweet in pos_string:
    (tweet_tokenizer.tokenize(p_tweet))

In [9]:
import string
import re

## Importing stopwords 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
## Importing porter stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

## Importing the tokenizer 
from nltk.tokenize import TweetTokenizer

def clean_tweets(tweet):
    tweet = re.sub(r'\$\w*', '', tweet)
    
    ## Cleaing up RTs
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    ## Removing hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    ## Removing hastags
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
                word not in string.punctuation): # remove punctuation
            ##stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(word)
 
    return tweets_clean

In [10]:
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary
 
# positive tweets feature set
pos_tweets_set = []
for tweet in pos_string:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    
    
## negative tweets
neg_tweets_set = []
for tweet in neg_string:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))
    
## general tweets

gen_tweets_set = []
for tweet in gen_string:
    gen_tweets_set.append((bag_of_words(tweet), 'gen'))

In [12]:
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
 
test_set = pos_tweets_set[:40] + neg_tweets_set[:40] + gen_tweets_set[:40]
train_set = pos_tweets_set[10:] + neg_tweets_set[10:] + gen_tweets_set[10:]
print(len(train_set),len(test_set))

14267 120


In [13]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) # Output: 0.765
 
print (classifier.show_most_informative_features(10))

0.8583333333333333
Most Informative Features
                disaster = True              neg : pos    =    212.4 : 1.0
                complete = True              neg : pos    =    212.1 : 1.0
                    grow = True              neg : pos    =    209.9 : 1.0
                  google = True              neg : pos    =    149.4 : 1.0
                   backs = True              pos : neg    =    142.3 : 1.0
                 notmeus = True              pos : gen    =    134.7 : 1.0
                      en = True              gen : neg    =    130.3 : 1.0
                      de = True              gen : neg    =    119.8 : 1.0
        berniebeatstrump = True              pos : gen    =    119.3 : 1.0
                     que = True              gen : neg    =    113.4 : 1.0
None


In [14]:
test_tweet1 = "Bernie Sanders Just Got The Latino Endorsement"
custom_tweet_set = bag_of_words(test_tweet1)
prob_result = classifier.prob_classify(custom_tweet_set)
print ('Overall Result: ' + str(prob_result.max())) 
print ('Negative Tweet: ' + str(prob_result.prob("neg"))) 
print ('Positive Tweet: ' + str(prob_result.prob("pos")))
print ('General Tweet: ' + str(prob_result.prob("gen")))

Overall Result: pos
Negative Tweet: 0.00025418195725589746
Positive Tweet: 0.9996675036706884
General Tweet: 7.831437205665417e-05


In [15]:
test_tweet2 = "Amy Klobuchar"
custom_tweet_set = bag_of_words(test_tweet2)
prob_result = classifier.prob_classify(custom_tweet_set)
print ('Overall Result: ' + str(prob_result.max())) 
print ('Negative Tweet: ' + str(prob_result.prob("neg"))) 
print ('Positive Tweet: ' + str(prob_result.prob("pos")))
print ('General Tweet: ' + str(prob_result.prob("gen")))

Overall Result: pos
Negative Tweet: 0.05982587862355717
Positive Tweet: 0.9400752929779851
General Tweet: 9.88283984568889e-05


In [16]:
test_tweet3 = "Bernie sucks!"
custom_tweet_set = bag_of_words(test_tweet3)
prob_result = classifier.prob_classify(custom_tweet_set)
print ('Overall Result: ' + str(prob_result.max())) 
print ('Negative Tweet: ' + str(prob_result.prob("neg"))) 
print ('Positive Tweet: ' + str(prob_result.prob("pos")))
print ('General Tweet: ' + str(prob_result.prob("gen")))

Overall Result: pos
Negative Tweet: 0.4729993399485273
Positive Tweet: 0.5172007261627484
General Tweet: 0.009799933888723748


In [17]:
candidates_full_name = ['Joe Biden', 'Amy Klobuchar', 'Pete Buttigieg', 'Bloomberg', 'Bernie Sanders', 'Elizabeth Warren']
candidates_last_name = ['Biden', 'Klobuchar', 'Buttigieg', 'Bloomberg', 'Sanders', 'Warren']

def get_sentiment(name):
    
    final_results = []
    
    for potential_pres in name:
        custom_tweet_set = bag_of_words(potential_pres)
        prob_result = classifier.prob_classify(custom_tweet_set)
        name = potential_pres
        results = str(prob_result.max()) 
        neg_results = str(prob_result.prob("neg"))
        pos_results = str(prob_result.prob("pos"))
        create_dict = {'Name': name, 'Results': results, 'Negative Probability': neg_results, 'Positive Probability': pos_results}
        final_results.append(create_dict)
    
    return final_results

In [18]:
post_sentiment = get_sentiment(candidates_last_name)

In [19]:
post_df = pd.DataFrame(post_sentiment)

In [20]:
post_df['scenario'] = 'post_debate'

In [21]:
post_df.to_csv('post_debate_candidate_sentiment.csv')

In [22]:
positives = df1[df1['predictor']=='pos']
negatives = df1[df1['predictor']=='neg']
general = df1[df1['predictor']=='gen']

In [23]:
print("The amount of predicted general tweets were: {} \nThe amount of predicted positive tweets were: {} \nThe amount of predicted negative tweets were: {}".format(len(general),len(positives),len(negatives)))

The amount of predicted general tweets were: 2488 
The amount of predicted positive tweets were: 5363 
The amount of predicted negative tweets were: 6446


In [24]:
import pickle
filename = 'post_debate_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [56]:
loaded_model = pickle.load(open(filename, 'rb'))

In [57]:
print(loaded_model.show_most_informative_features(10))

Most Informative Features
                 potenti = True              neg : pos    =    268.1 : 1.0
                 complet = True              neg : pos    =    202.6 : 1.0
                      en = True              gen : neg    =    159.8 : 1.0
                  notmeu = True              pos : gen    =    140.0 : 1.0
                    fear = True              neg : pos    =    127.9 : 1.0
        berniebeatstrump = True              pos : gen    =    127.4 : 1.0
                     del = True              gen : neg    =    124.7 : 1.0
           berniewoniowa = True              pos : neg    =    123.2 : 1.0
                      de = True              gen : neg    =    120.5 : 1.0
                    bigu = True              pos : neg    =    118.7 : 1.0
None


In [49]:
candidates_full_name = ['Joe Biden', 'Amy Klobuchar', 'Pete Buttigieg', 'Bloomberg', 'Bernie Sanders', 'Elizabeth Warren']
candidates_last_name = ['Biden', 'Klobuchar', 'Buttigieg', 'Bloomberg', 'Sanders', 'Warren']

def get_sentiment_pickle(name):
    
    final_results = []
    
    for potential_pres in name:
        custom_tweet_set = bag_of_words(potential_pres)
        prob_result = loaded_model.prob_classify(custom_tweet_set)
        name = potential_pres
        results = str(prob_result.max()) 
        neg_results = str(prob_result.prob("neg"))
        pos_results = str(prob_result.prob("pos"))
        create_dict = {'Name': name, 'Results': results, 'Negative Probability': neg_results, 'Positive Probability': pos_results}
        final_results.append(create_dict)
    
    return final_results

In [51]:
get_sentiment_pickle = get_sentiment(candidates_last_name)

print(get_sentiment_pickle)

[{'Name': 'Biden', 'Results': 'pos', 'Negative Probability': '0.41439243029682404', 'Positive Probability': '0.5696958819679321'}, {'Name': 'Klobuchar', 'Results': 'pos', 'Negative Probability': '0.1793254022356111', 'Positive Probability': '0.8061365276892786'}, {'Name': 'Buttigieg', 'Results': 'pos', 'Negative Probability': '0.3715291786987502', 'Positive Probability': '0.5941046002990142'}, {'Name': 'Bloomberg', 'Results': 'neg', 'Negative Probability': '0.5326488253926238', 'Positive Probability': '0.4096469210687091'}, {'Name': 'Sanders', 'Results': 'pos', 'Negative Probability': '0.2084615288329245', 'Positive Probability': '0.7547557660546834'}, {'Name': 'Warren', 'Results': 'pos', 'Negative Probability': '0.23843938422971764', 'Positive Probability': '0.7319077032466412'}]


In [52]:
post_df = pd.DataFrame(get_sentiment_pickle)

In [53]:
post_df

Unnamed: 0,Name,Negative Probability,Positive Probability,Results
0,Biden,0.414392430296824,0.5696958819679321,pos
1,Klobuchar,0.1793254022356111,0.8061365276892786,pos
2,Buttigieg,0.3715291786987502,0.5941046002990142,pos
3,Bloomberg,0.5326488253926238,0.4096469210687091,neg
4,Sanders,0.2084615288329245,0.7547557660546834,pos
5,Warren,0.2384393842297176,0.7319077032466412,pos


In [27]:
def get_tokens(tweet):
    words = clean_tweets(tweet)    
    return words 

candidate_names = ['biden', 'klobuchar', 'buttigieg', 'bloomberg', 'sanders', 'warren',
                  'joe biden', 'amy klobuchar', 'pete buttigieg', 'michael bloomberg',
                   'bernie sanders', 'elizabeth warren',
                  'joe','amy','pete','michael','bernie','liz','elizabeth','bernard']
list_comp = []
found_match = []

for i,v in df1.iterrows():
    text = v['text']
    tokenize = get_tokens(text)
    list_comp = [name for name in tokenize if name in candidate_names]
    found_match.append(list_comp)
    
df1['associated_candidate'] = found_match

In [29]:
df1.to_csv('post_debate_predicted.csv')