In [3]:
## Import pandas dependancy and pull in the scraped file.

import pandas as pd

df1 = pd.read_csv('nevada_caucus_2_18.csv')

In [4]:
## Clean up work to get the sentiments aligned in a column like fashion

pos_s = df1[df1['pos_sentiment']=='yes'].reset_index()
neg_s = df1[df1['neg_sentiment']=='yes'].reset_index()
gen_s = df1[df1['general']=='yes'].reset_index()

## Creating indivual sets before appending them back together

pos_s['overall_sentiment'] = 'positive'
gen_s['overall_sentiment'] = 'general'
neg_s['overall_sentiment'] = 'negative'

In [5]:
## Appending datasets back together so the sentiments line up

append_df = pos_s.append(neg_s)
second_append = append_df.append(gen_s)

In [6]:
## Creating the new Dataframe for tokenizing the tweets

df_pos = second_append[second_append['overall_sentiment']=='positive'].reset_index()
df_neg = second_append[second_append['overall_sentiment']=='negative'].reset_index()
df_gen = second_append[second_append['overall_sentiment']=='general'].reset_index()

pos_string = df_pos['text']
neg_string = df_neg['text']
gen_string = df_gen['text']

In [7]:
## Tokenizing the tweets - testing

from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def find_tweet_cand(tweets):
    for tweet in tweets:
        s= (tweet_tokenizer.tokenize(tweet))
        return s

In [8]:
import string
import re

## Importing stopwords 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
## Importing porter stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

## Importing the tokenizer 
from nltk.tokenize import TweetTokenizer

def clean_tweets(tweet):
    tweet = re.sub(r'\$\w*', '', tweet)
    
    ## Cleaing up RTs
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    ## Removing hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    ## Removing hastags
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
                word not in string.punctuation): # remove punctuation
            ##stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(word)
 
    return tweets_clean

In [9]:
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary
    
def get_tokens(tweet):
    words = clean_tweets(tweet)    
    return words 

# positive tweets feature set
pos_tweets_set = []
for tweet in pos_string:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    
    
## negative tweets
neg_tweets_set = []
for tweet in neg_string:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))
    
## general tweets

gen_tweets_set = []
for tweet in gen_string:
    gen_tweets_set.append((bag_of_words(tweet), 'gen'))

In [10]:
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
 
test_set = pos_tweets_set[:40] + neg_tweets_set[:40] + gen_tweets_set[:40]
train_set = pos_tweets_set[10:] + neg_tweets_set[10:] + gen_tweets_set[10:]
print(len(train_set),len(test_set))

950 120


In [11]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) # Output: 0.765
 
print (classifier.show_most_informative_features(60))

0.7916666666666666
Most Informative Features
            presidential = True              gen : neg    =     28.3 : 1.0
                  season = True              gen : pos    =     28.2 : 1.0
                   biden = True              neg : gen    =     26.2 : 1.0
                     eve = True              gen : neg    =     24.5 : 1.0
                   major = True              pos : neg    =     23.5 : 1.0
                   backs = True              pos : neg    =     23.5 : 1.0
        berniebeatstrump = True              pos : gen    =     21.8 : 1.0
                 notmeus = True              pos : gen    =     21.8 : 1.0
                   video = True              neg : gen    =     17.4 : 1.0
                   group = True              pos : neg    =     17.1 : 1.0
             legislative = True              neg : gen    =     16.6 : 1.0
                nvcaucus = True              pos : gen    =     16.4 : 1.0
                    like = True              neg : gen 

In [14]:
test_tweet1 = "fsadfkadsfkhasdkhfkhsadfhkdsakhfkds"
custom_tweet_set = bag_of_words(test_tweet1)
prob_result = classifier.prob_classify(custom_tweet_set)
print ('Overall Result: ' + str(prob_result.max())) 
print ('Negative Tweet: ' + str(prob_result.prob("neg"))) 
print ('Positive Tweet: ' + str(prob_result.prob("pos")))
print ('General Tweet: ' + str(prob_result.prob("gen")))
tokens = get_tokens(test_tweet1)

Overall Result: gen
Negative Tweet: 0.29269574356279554
Positive Tweet: 0.35260115606936415
General Tweet: 0.35470310036784025


In [279]:
df1['associated_candidate'] = found_match

In [106]:
test_tweet2 = "Amy Klobuchar"
custom_tweet_set = bag_of_words(test_tweet2)
prob_result = classifier.prob_classify(custom_tweet_set)
print ('Overall Result: ' + str(prob_result.max())) 
print ('Negative Tweet: ' + str(prob_result.prob("neg"))) 
print ('Positive Tweet: ' + str(prob_result.prob("pos")))
print ('General Tweet: ' + str(prob_result.prob("gen")))

Overall Result: pos
Negative Tweet: 0.27209789851501076
Positive Tweet: 0.7249842845130791
General Tweet: 0.0029178169719105093


In [100]:
test_tweet3 = "Bernie sucks!"
custom_tweet_set = bag_of_words(test_tweet3)
prob_result = classifier.prob_classify(custom_tweet_set)
print ('Overall Result: ' + str(prob_result.max())) 
print ('Negative Tweet: ' + str(prob_result.prob("neg"))) 
print ('Positive Tweet: ' + str(prob_result.prob("pos")))
print ('General Tweet: ' + str(prob_result.prob("gen")))

Overall Result: neg
Negative Tweet: 0.7462502013893247
Positive Tweet: 0.2084031718372353
General Tweet: 0.045346626773439175


In [19]:
candidates_full_name = ['Joe Biden', 'Amy Klobuchar', 'Pete Buttigieg', 'Bloomberg', 'Bernie Sanders', 'Elizabeth Warren']
candidates_last_name = ['Biden', 'Klobuchar', 'Buttigieg', 'Bloomberg', 'Sanders', 'Warren']

def get_sentiment(name):
    
    final_results = []
    
    for potential_pres in name:
        custom_tweet_set = bag_of_words(potential_pres)
        prob_result = classifier.prob_classify(custom_tweet_set)
        name = potential_pres
        results = str(prob_result.max()) 
        neg_results = str(prob_result.prob("neg"))
        pos_results = str(prob_result.prob("pos"))
        gen_results = str(prob_result.prob("gen"))
        create_dict = {'Name': name, 'Results': results, 'Negative Probability': neg_results, 'Positive Probability': pos_results,
                      'General Probability': gen_results}
        final_results.append(create_dict)
    
    return final_results

In [20]:
pre_sentiment = get_sentiment(candidates_last_name)

In [25]:
pre_sentiment

[{'Name': 'Biden',
  'Results': 'neg',
  'Negative Probability': '0.5199238815643935',
  'Positive Probability': '0.4560721114220566',
  'General Probability': '0.02400400701354983'},
 {'Name': 'Klobuchar',
  'Results': 'pos',
  'Negative Probability': '0.35476889886497753',
  'Positive Probability': '0.6129692758072477',
  'General Probability': '0.03226182532777457'},
 {'Name': 'Buttigieg',
  'Results': 'pos',
  'Negative Probability': '0.40418160387873053',
  'Positive Probability': '0.4894214832560152',
  'General Probability': '0.10639691286525435'},
 {'Name': 'Bloomberg',
  'Results': 'pos',
  'Negative Probability': '0.4587476789474414',
  'Positive Probability': '0.4824200846139234',
  'General Probability': '0.05883223643863472'},
 {'Name': 'Sanders',
  'Results': 'pos',
  'Negative Probability': '0.14620690433217434',
  'Positive Probability': '0.7470681352142821',
  'General Probability': '0.10672496045354342'},
 {'Name': 'Warren',
  'Results': 'pos',
  'Negative Probability

In [28]:
pre_df = pd.DataFrame(pre_sentiment)

In [290]:
pre_df.to_csv('pre_debate_candidate_sentiment.csv')

In [291]:
## Create the full file with all the predictions on it 

full_file = df1['text']

predictor = []

for i in full_file:
    full_tweets = bag_of_words(i)
    prob_result = classifier.prob_classify(full_tweets)
    predictor.append(str(prob_result.max()))
    
df1['predictor'] = predictor

positives = df1[df1['predictor']=='pos']
negatives = df1[df1['predictor']=='neg']
general = df1[df1['predictor']=='gen']

In [292]:
print("The amount of predicted general tweets were: {} \nThe amount of predicted positive tweets were: {} \nThe amount of predicted negative tweets were: {}".format(len(general),len(positives),len(negatives)))

The amount of predicted general tweets were: 2437 
The amount of predicted positive tweets were: 5217 
The amount of predicted negative tweets were: 6645


In [293]:
candidate_names = ['biden', 'klobuchar', 'buttigieg', 'bloomberg', 'sanders', 'warren',
                  'joe biden', 'amy klobuchar', 'pete buttigieg', 'michael bloomberg',
                   'bernie sanders', 'elizabeth warren',
                  'joe','amy','pete','michael','bernie','liz','elizabeth','bernard']
list_comp = []
found_match = []

for i,v in df1.iterrows():
    text = v['text']
    tokenize = get_tokens(text)
    list_comp = [name for name in tokenize if name in candidate_names]
    found_match.append(list_comp)

In [294]:
df1['associated_candidate'] = found_match

In [295]:
df1

Unnamed: 0,general,pos_sentiment,pos_cand1,pos_cand2,pos_cand3,pos_cand4,neg_sentiment,neg_cand1,neg_cand2,neg_cand3,...,favorites,username,text,geo,mentions,hashtags,id,permalink,associated_candidate,predictor
0,no,yes,Sanders,,,,no,,,,...,0,DeannaLee1369,Bernie Sanders Gains Endorsement From Latinx G...,,,,1.229957e+18,https://twitter.com/DeannaLee1369/status/12299...,"[bernie, sanders]",pos
1,yes,no,,,,,no,,,,...,0,8NewsNow,Today is the final day of early voting ahead o...,,,#8NNhttps,1.229956e+18,https://twitter.com/8NewsNow/status/1229956270...,[],gen
2,no,yes,Sanders,,,,no,,,,...,15,BilldeBlasio,The #NevadaCaucus is absolutely crucial to @Be...,,@BernieSanders,#NevadaCaucus #UNLV,1.229956e+18,https://twitter.com/BilldeBlasio/status/122995...,[],pos
3,no,yes,Sanders,,,,no,,,,...,0,SalKappa,#PresidentSanders #BernieWonIowa #BernieWonNew...,,,#PresidentSanders #BernieWonIowa #BernieWonNew...,1.229956e+18,https://twitter.com/SalKappa/status/1229956122...,"[bernie, sanders]",pos
4,yes,,,,,,,,,,...,0,chad_b_morrow,Absolutely absurd how long the line is for ear...,,,#NevadaCaucus,1.229956e+18,https://twitter.com/chad_b_morrow/status/12299...,[],neg
5,no,no,,,,,yes,Buttigieg,Trump,,...,0,DestiGrace1,Lordie the discussion is about dirty AF #Corru...,,@maddow,#CorruptBarr #Maddow #PeteButtigieg #DisBarr #...,1.229956e+18,https://twitter.com/DestiGrace1/status/1229955...,[],neg
6,no,no,,,,,yes,Buttigieg,,,...,2,iamwillkeating,#MayorPete Buttigieg just said his religious f...,,,#MayorPete #NevadaCaucus #2020election #CNNTow...,1.229955e+18,https://twitter.com/iamwillkeating/status/1229...,[buttigieg],neg
7,no,yes,Buttigieg,,,,no,,,,...,0,Magsaliciouss,Just picked up my Precinct Captain box for Sat...,,@PeteButtigieg,#TeamPete,1.229955e+18,https://twitter.com/Magsaliciouss/status/12299...,[],pos
8,no,yes,Sanders,,,,no,,,,...,1,GoHawksThe12,Bernie is now the ONLY candidate with NO billi...,,,#warren2020 #Bernie2020 #PetesBillionaires #Wa...,1.229955e+18,https://twitter.com/GoHawksThe12/status/122995...,"[bernie, warren, bernie, biden, biden]",pos
9,no,yes,Sanders,,,,yes,Bloomberg,,,...,0,pearshavehearti,#BernieSanders2020 #NotMeUs #Bernie2020 #Berni...,,,#BernieSanders2020 #NotMeUs #Bernie2020 #Berni...,1.229955e+18,https://twitter.com/pearshavehearti/status/122...,[bernie],pos


In [146]:
import pickle
filename = 'pre_debate_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [296]:
df1.to_csv('pre_debate_predicted_new_with_candidate.csv')

In [None]:
df1

In [309]:
## Predict results of the post debate tweet set

df2 = pd.read_csv('nevada_caucus_2_19_post_debate.csv', encoding='utf8')

post_debate = df2['text']

post_debate_predictor = []

for i in post_debate:
    full_tweets = bag_of_words(i)
    prob_result = classifier.prob_classify(full_tweets)
    post_debate_predictor.append(str(prob_result.max()))
    
df2['predictor'] = post_debate_predictor

df2.to_csv('post_debate_file.csv')

In [305]:
## Predict results of the pre-election set 

df3 = pd.read_csv('nevada_caucus_pre_election_2_21.csv', encoding='utf8')

pre_election = df3['text']

pre_election_pred = []

for i in pre_election:
    full_tweets = bag_of_words(i)
    prob_result = classifier.prob_classify(full_tweets)
    pre_election_pred.append(str(prob_result.max()))
    
df3['predictor'] = pre_election_pred

df3.to_csv('pre_election_file.csv')

In [308]:
## Predict results of the post-election set

df4 = pd.read_csv('nevada_caucus_post_election_2_24.csv', encoding='utf8')

post_elec = df4['text']

post_elec_pred = []

for i in post_elec:
    full_tweets = bag_of_words(i)
    prob_result = classifier.prob_classify(full_tweets)
    post_elec_pred.append(str(prob_result.max()))
    
df4['predictor'] = post_elec_pred

df4.to_csv('post_election_file.csv')

In [311]:
pre_elect_df = pd.read_csv('pre_election_file.csv')

In [312]:
post_elect_df = pd.read_csv('post_election_file.csv')

In [314]:
post_elect_df.head()

Unnamed: 0.1,Unnamed: 0,date,username,to,replies,retweets,favorites,text,geo,mentions,hashtags,id,permalink,predictor
0,0,2020-02-25 1:23:21,Stanlee011,,0,0,0,"Nevada Caucus Final Results: Bernie 47%, Biden...",,@thelastrefuge2,,1.232114e+18,https://twitter.com/Stanlee011/status/12321138...,pos
1,1,2020-02-25 1:22:52,KayanaMaree,,0,0,0,The ladies at @TheView just don't get it!!! Th...,,@TheView @YouTube,,1.232114e+18,https://twitter.com/KayanaMaree/status/1232113...,pos
2,2,2020-02-25 1:22:49,scldef22,,0,0,0,@RedEaglePatriot Nevada caucus analysis?,,@RedEaglePatriot,,1.232114e+18,https://twitter.com/scldef22/status/1232113689...,gen
3,3,2020-02-25 1:22:12,BlackAmCaucus,,0,1,2,"Nevada Democratic Caucus Turnout: 2020 - 105,1...",,,#NevadaCaucus,1.232114e+18,https://twitter.com/BlackAmCaucus/status/12321...,gen
4,4,2020-02-25 1:21:25,uniquechoices,kristenluvslife,0,0,0,Not just that won around 27K out of 3M in #Nev...,,,#NevadaCaucus,1.232113e+18,https://twitter.com/uniquechoices/status/12321...,pos


In [315]:
def create_array(dataframe):
    candidate_names = ['biden', 'klobuchar', 'buttigieg', 'bloomberg', 'sanders', 'warren',
                      'joe biden', 'amy klobuchar', 'pete buttigieg', 'michael bloomberg',
                       'bernie sanders', 'elizabeth warren',
                      'joe','amy','pete','michael','bernie','liz','elizabeth','bernard']
    list_comp = []
    found_match = []

    for i,v in dataframe.iterrows():
        text = v['text']
        tokenize = get_tokens(text)
        list_comp = [name for name in tokenize if name in candidate_names]
        found_match.append(list_comp)
        
    dataframe['associated_candidate'] = found_match
    return dataframe

In [316]:
final_pre_elect_df = create_array(pre_elect_df)

In [318]:
final_post_elect_df = create_array(post_elect_df)

In [319]:
final_pre_elect_df.to_csv('pre_election_predicted.csv')

In [320]:
final_post_elect_df.to_csv('post_election_predicted.csv')