### Training the model

In [1]:
import string
import re
import json
import pickle
import pandas as pd
import nltk
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [2]:
#if heavier emoji stripper is needed

# def remove_emoji(string):
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                u"\U00002500-\U00002BEF"  # chinese char
#                                u"\U00002702-\U000027B0"
#                                u"\U00002702-\U000027B0"
#                                u"\U000024C2-\U0001F251"
#                                u"\U0001f926-\U0001f937"
#                                u"\U00010000-\U0010ffff"
#                                u"\u2640-\u2642"
#                                u"\u2600-\u2B55"
#                                u"\u200d"
#                                u"\u23cf"
#                                u"\u23e9"
#                                u"\u231a"
#                                u"\ufe0f"  # dingbats
#                                u"\u3030"
#                                "]+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'', string)

In [2]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

def tokenization(text):
    text = re.split('\W+', text)
    return text

def remove_urls(text):
    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text)
    return text

def remove_mentions(text):
    text = re.sub(r'(^|[^@\w])@(\w{1,15})\b', '', text)
    return text

def remove_empty_tokens(text):
    text = list(filter(None, text))
    return text

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [53]:
# 0 = negative, 4 = positive

df = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None, names=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [55]:
df['no_urls'] = df['text'].apply(lambda x: remove_urls(x)) #remove urls
df['unmentioned'] = df['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
df['depunctualized'] = df['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
df['tokenized'] = df['depunctualized'].apply(lambda x: tokenization(x.lower())) #tokenize
df['improved_tokenized'] = df['tokenized'].apply(lambda x: remove_empty_tokens(x)) #remove empty tokens
df['nonstop'] = df['improved_tokenized'].apply(lambda x: remove_stopwords(x)) #remove stopwords
#df['stemmed'] = df['nonstop'].apply(lambda x: stemming(x)) #stem the tokens
df['rejoined'] = df['nonstop'].apply(lambda x: " ".join(x)) #rejoin for vectorization
df.head()

Unnamed: 0,target,id,date,flag,user,text,no_urls,unmentioned,depunctualized,tokenized,improved_tokenized,nonstop,rejoined
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot - Awww, that's a bummer. You sho...","- Awww, that's a bummer. You shoulda got Da...",Awww thats a bummer You shoulda got David ...,"[, awww, thats, a, bummer, you, shoulda, got, ...","[awww, thats, a, bummer, you, shoulda, got, da...","[awww, thats, bummer, shoulda, got, david, car...",awww thats bummer shoulda got david carr third...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, faceb...","[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might...",upset cant update facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,I dived many times for the ball Managed to sa...,"[, i, dived, many, times, for, the, ball, mana...","[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",dived many times ball managed save rest go bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...",no its not behaving at all im mad why am i he...,"[, no, its, not, behaving, at, all, im, mad, w...","[no, its, not, behaving, at, all, im, mad, why...","[behaving, im, mad, cant, see]",behaving im mad cant see


In [56]:
#extract to numpy arrays

y = df.target.to_numpy()
X = df.rejoined.to_numpy()

In [57]:
#split into train (80%) and test (20%)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8)

for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_t = X[train_index], X[test_index]
    y_train, y_t = y[train_index], y[test_index]

TRAIN: [ 693767 1081409  678396 ...   87567 1393165   31307] TEST: [1442636   44280  702174 ...  813844 1180632 1077844]


In [58]:
#split test into test (10%) and val (10%)

sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=0.5)

for test_i, val_i in sss_val.split(X_t, y_t):
    print("TEST:", test_i, "Val:", val_i)
    X_test, X_val = X_t[test_i], X_t[val_i]
    y_test, y_val = y_t[test_i], y_t[val_i]

TEST: [293622 252902 281338 ... 263641 233725 292555] Val: [294995 131589 262548 ... 114874 102871 184032]


In [59]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape) #check if split sizes are correct

(1280000,) (1280000,) (160000,) (160000,) (160000,) (160000,)


In [73]:
#vectorize the tweets

# cv = CountVectorizer(binary=True)
# cv.fit(X_train) #fit the vectorizer

#X_train_vectorized = cv.transform(X_train)
X_test_vectorized = cv.transform(X_test)
X_val_vectorized = cv.transform(X_val)

In [18]:
y_pred = MultinomialNB().fit(X_train_vectorized, y_train).predict(X_val_vectorized)
print("Number of mislabeled points out of a total %d points : %d"
       % (X_val_vectorized.shape[0], (y_val != y_pred).sum()))

Number of mislabeled points out of a total 160000 points : 37394


In [61]:
# mnb = MultinomialNB().fit(X_train_vectorized, y_train) #final trained model
# mnb_predictions = mnb.predict(X_val_vectorized)
mnb_accuracy = mnb.score(X_test_vectorized, y_test)
print(mnb_accuracy)

0.2673125


In [70]:
for i in y_test:
    if i == 1:
        print('what')

In [65]:
mnb_predictions = mnb.predict(X_test_vectorized)
transformed_mnb_predictions = [0 if prediction == True else 4 for prediction in mnb_predictions]

In [69]:
print(metrics.classification_report(y_test, mnb_predictions, digits=3, target_names=['negative sentiment', 'positive sentiment']))

              precision    recall  f1-score   support

           0      0.397     0.535     0.455     80000
           1      0.000     0.000     0.000         0
           4      0.000     0.000     0.000     80000

   micro avg      0.267     0.267     0.267    160000
   macro avg      0.132     0.178     0.152    160000
weighted avg      0.198     0.267     0.228    160000



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [53]:
#save trained model as pickle

# filename = 'MNB_sentiment_classifier_model_lemmatized.sav'
# with open(filename, 'wb') as f_out:
#     pickle.dump((mnb, cv), f_out)

In [86]:
#stemmed model

#load the model from disk
filename = 'models/MNB_sentiment_classifier_model_lemmatized.sav'

stemmed_model, stemmed_cv = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

X_test_vectorized = stemmed_cv.transform(X_test) #transform with pretrained vectorizer
X_val_vectorized = stemmed_cv.transform(X_val) #transform with pretrained vectorizer

stemmed_accuracy = stemmed_model.score(X_val_vectorized, y_val) #calculate accuracy
print(stemmed_accuracy)

0.74414375


In [87]:
stemmed_predictions = stemmed_model.predict(X_test_vectorized) #let the model predict on unseen test set
print(metrics.classification_report(y_test, stemmed_predictions, digits=3, target_names=['negative sentiment', 'positive sentiment']))

                    precision    recall  f1-score   support

negative sentiment      0.743     0.749     0.746     80000
positive sentiment      0.747     0.741     0.744     80000

         micro avg      0.745     0.745     0.745    160000
         macro avg      0.745     0.745     0.745    160000
      weighted avg      0.745     0.745     0.745    160000



In [83]:
#unstemmed model

#load the model from disk
filename = 'models/MNB_sentiment_classifier_model.sav'

unstemmed_model, unstemmed_cv = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

X_test_vectorized = unstemmed_cv.transform(X_test) #transform with pretrained vectorizer
X_val_vectorized = unstemmed_cv.transform(X_val) #transform with pretrained vectorizer

unstemmed_accuracy = unstemmed_model.score(X_val_vectorized, y_val) calculate accuracy
print(unstemmed_accuracy)

0.78853125


In [84]:
unstemmed_predictions = unstemmed_model.predict(X_test_vectorized) #let the model predict on unseen test set
print(metrics.classification_report(y_test, unstemmed_predictions, digits=3, target_names=['negative sentiment', 'positive sentiment']))

                    precision    recall  f1-score   support

negative sentiment      0.767     0.832     0.798     80000
positive sentiment      0.816     0.748     0.780     80000

         micro avg      0.790     0.790     0.790    160000
         macro avg      0.792     0.790     0.789    160000
      weighted avg      0.792     0.790     0.789    160000



### Testing the model on provided tweet dataset

In [92]:
#take a subset of 30 tweets

jsonlist = []
c = 0
with open('data/geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    for line in f:
        c+=1
        jsonlist.append(json.loads(line))
        if c == 5000:
            break

In [105]:
jsonlist[1]['entities']['hashtags']

[{'text': 'NOJUSTICE', 'indices': [62, 72]},
 {'text': 'TrumpPence', 'indices': [93, 104]}]

In [16]:
jsonlist[3]['entities']['user_mentions']

[{'screen_name': 'theblaze',
  'name': 'TheBlaze',
  'id': 10774652,
  'id_str': '10774652',
  'indices': [0, 9]},
 {'screen_name': 'realDonaldTrump',
  'name': 'Donald J. Trump',
  'id': 25073877,
  'id_str': '25073877',
  'indices': [10, 26]}]

In [32]:
test_input = [tweet['text'] for tweet in jsonlist]

In [33]:
#clean the subset of tweets

testframe = pd.DataFrame(test_input, columns=['text'])
testframe['no_urls'] = testframe['text'].apply(lambda x: remove_urls(x)) #remove urls
testframe['unmentioned'] = testframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
testframe['depunctualized'] = testframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
testframe['tokenized'] = testframe['depunctualized'].apply(lambda x: tokenization(x.lower())) #tokenize
testframe['improved_tokenized'] = testframe['tokenized'].apply(lambda x: remove_empty_tokens(x)) #remove empty tokens
testframe['nonstop'] = testframe['improved_tokenized'].apply(lambda x: remove_stopwords(x)) #remove stopwords
testframe['stemmed'] = testframe['nonstop'].apply(lambda x: stemming(x))
testframe['rejoined'] = testframe['stemmed'].apply(lambda x: " ".join(x))
testframe.head()

Unnamed: 0,text,no_urls,unmentioned,depunctualized,tokenized,improved_tokenized,nonstop,stemmed,rejoined
0,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,@theblaze @realDonaldTrump,,,"[, ]",[],[],[],
1,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,\nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \...,\nALL IN COLLUSION TOGETHER \n\nNOJUSTICE \n...,"[, all, in, collusion, together, nojustice, tr...","[all, in, collusion, together, nojustice, trum...","[collusion, together, nojustice, trumppence]","[collus, togeth, nojustic, trumppenc]",collus togeth nojustic trumppenc
2,@theblaze @realDonaldTrump https://t.co/n050DB...,@theblaze @realDonaldTrump,,,"[, ]",[],[],[],
3,@HillaryClinton he will do in one year all the...,@HillaryClinton he will do in one year all the...,he will do in one year all the things you sho...,he will do in one year all the things you sho...,"[, he, will, do, in, one, year, all, the, thin...","[he, will, do, in, one, year, all, the, things...","[one, year, things, done, eight]","[one, year, thing, done, eight]",one year thing done eight
4,#CNN #newday clear #Trump deliberately throwin...,#CNN #newday clear #Trump deliberately throwin...,#CNN #newday clear #Trump deliberately throwin...,CNN newday clear Trump deliberately throwing t...,"[cnn, newday, clear, trump, deliberately, thro...","[cnn, newday, clear, trump, deliberately, thro...","[cnn, newday, clear, trump, deliberately, thro...","[cnn, newday, clear, trump, deliber, throw, ra...",cnn newday clear trump deliber throw racein kn...


In [44]:
testinput = [tweet for tweet in testframe.rejoined.values if tweet] #drop empty tweets
testinput

['lorettalynch collus togeth nojustic trumppenc',
 'one year thing done eight',
 'cnn newday clear trump deliber throw racein knew isi destabil mideast start wiraq invas',
 'wouldnt recogn lie came mouth continu nevertrump',
 'trump trumppenc makeamericagreatagain',
 'kid know su someon that beauti thing human could anoth human',
 'cofound isi crook evil lie witch live',
 'want comparison tri maim vet pre amp post iraq pullout bar graph',
 'total concur elect cra cra n corrupt gov mind blow trump last hope',
 'issu idiot claim found isi trump go hell lie amp steal shame',
 'cant stand ortak look windont settl teamgov youin',
 'isnt rape alleg get attent caus seem probabl',
 'stole white hous furnitur',
 'gop plead w trump control behavior week want year terrifi nevertrump crazi',
 'isi cofound hillari clinton obama also devil hillari sit left hand devil',
 'come jesu meet earth suppos',
 'hanniti think disbar ignor mr hamburg dishonest',
 'stop worri msm lie focu econampimmampdfn ur sp

In [46]:
#vectorize with the fitted vectorizer

test_input_vectorized = cv.transform(testinput)

In [47]:
test = mnb.predict(test_input_vectorized) #predict the sentiment of the tweets

In [62]:
#sentiment and stemmed tweet

output = pd.DataFrame(testinput, test).reset_index()
output.columns = ['sentiment', 'tweet']
output

Unnamed: 0,sentiment,tweet
0,4,lorettalynch collus togeth nojustic trumppenc
1,0,one year thing done eight
2,0,cnn newday clear trump deliber throw racein kn...
3,0,wouldnt recogn lie came mouth continu nevertrump
4,4,trump trumppenc makeamericagreatagain
5,4,kid know su someon that beauti thing human cou...
6,0,cofound isi crook evil lie witch live
7,0,want comparison tri maim vet pre amp post iraq...
8,0,total concur elect cra cra n corrupt gov mind ...
9,0,issu idiot claim found isi trump go hell lie a...


In [52]:
#pre cleaned tweet

for i in testframe.text:
    print(i)

@theblaze @realDonaldTrump https://t.co/TY9DlZ584c
@BarackObama @FBI@LORETTALYNCH ALL IN COLLUSION TOGETHER #NOJUSTICE @realDonaldTrump #TrumpPence https://t.co/5GMNZq40V3
@theblaze @realDonaldTrump https://t.co/n050DBSpv0
@HillaryClinton he will do in one year all the things you should have done in eight
#CNN #newday clear #Trump deliberately throwing this race,in 2007 he knew that #ISIS and destabilization of Mideast started w/Iraq invasion
@realDonaldTrump, you wouldn't recognize a lie if it came from your own mouth, and they do continually. #NeverTrump https://t.co/pKSQM8yikm
#Trump2016 #TrumpPence16 #MakeAmericaGreatAgain  https://t.co/l5UsYANVc9
"Kid, you know, suing someone? Thats the most beautiful thing 1 human being could do to another human being" @funnyordie @realDonaldTrump😂💩s
@HillaryClinton you ARE the co-founder of ISIS, you crooked, evil, lying, witch. How can you live with yourself?
@Geraldanthro @NeilTurner_ @realDonaldTrump want to do a comparison try maimed Vets pr

### Topic modelling

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim import corpora, models
from pprint import pprint
stemmer = SnowballStemmer('english')

In [4]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [107]:
#all hashtags and locations
hashtags = {}

with open('data/geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    for line in f:
        tweet = json.loads(line)
        if len(tweet['entities']['hashtags']) > 0:
            hashtags[tweet['id']] = {} #make a nested dict for every tweet
            hashtaglist = [] #make empty list for hashtags
            for item in tweet['entities']['hashtags']: #loop through hashtags in tweet
                hashtaglist.append(item['text'])
            hashtags[tweet['id']]['hashtags'] = hashtaglist
            try:
                hashtags[tweet['id']]['location'] = tweet['place']['full_name']
            except TypeError:
                continue
            

#clean the subset of tweets
hframe = pd.DataFrame.from_dict(hashtags).T
hframe.head()

Unnamed: 0,hashtags,location
764039733076897792,"[NOJUSTICE, TrumpPence]","Baton Rouge, LA"
764039849850482689,"[CNN, newday, Trump, ISIS]","Baltimore, MD"
764039917924069376,[NeverTrump],"Palm Springs, CA"
764039925146742784,"[Trump2016, TrumpPence16, MakeAmericaGreatAgain]","Hammersmith, London"
764039994247819264,"[15for15, TeamGov, YouIn]","Middletown, KY"


In [109]:
hframe.to_pickle('data/hashtags_locations.pkl')

In [24]:
#take the text of all tweets
clinton_tweets = []
trump_tweets = []

#maybe add obama too?

with open('data/geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    for line in f:
        tweet = json.loads(line)
        mentionlist = []
        for mention in tweet['entities']['user_mentions']:
            mentionlist.append(mention['id'])
        if 1339835893 in mentionlist and 25073877 not in mentionlist:
            clinton_tweets.append(tweet['text'])
        if 25073877 in mentionlist and 1339835893 not in mentionlist:
            trump_tweets.append(tweet['text'])
            
            

#clean the subset of tweets
cframe = pd.DataFrame(clinton_tweets, columns=['text'])
cframe['no_urls'] = cframe['text'].apply(lambda x: remove_urls(x)) #remove urls
cframe['unmentioned'] = cframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
cframe['depunctualized'] = cframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
cframe['lemmed'] = cframe['depunctualized'].apply(lambda x: preprocess(x)) #lemmatize
cframe.head()


tframe = pd.DataFrame(trump_tweets, columns=['text'])
tframe['no_urls'] = tframe['text'].apply(lambda x: remove_urls(x)) #remove urls
tframe['unmentioned'] = tframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
tframe['depunctualized'] = tframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
tframe['lemmed'] = tframe['depunctualized'].apply(lambda x: preprocess(x)) #lemmatize
tframe.head()

Unnamed: 0,text,no_urls,unmentioned,depunctualized,lemmed
0,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,@theblaze @realDonaldTrump,,,[]
1,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,\nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \...,\nALL IN COLLUSION TOGETHER \n\nNOJUSTICE \n...,"[collus, nojustic, trumppenc]"
2,@theblaze @realDonaldTrump https://t.co/n050DB...,@theblaze @realDonaldTrump,,,[]
3,"@realDonaldTrump, you wouldn't recognize a lie...","@realDonaldTrump, you wouldn't recognize a lie...",", you wouldn't recognize a lie if it came from...",you wouldnt recognize a lie if it came from y...,"[wouldnt, recogn, come, mouth, continu, nevert..."
4,"""Kid, you know, suing someone? Thats the most ...","""Kid, you know, suing someone? Thats the most ...","""Kid, you know, suing someone? Thats the most ...",Kid you know suing someone Thats the most beau...,"[know, sue, that, beauti, thing, human, human]"


In [59]:
#with location added

#take the text of all tweets
clinton_tweets = {}
trump_tweets = {}

with open('data/geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    for line in f:
        tweet = json.loads(line)
        mentionlist = []
        for mention in tweet['entities']['user_mentions']:
            mentionlist.append(mention['id'])
        if 1339835893 in mentionlist and 25073877 not in mentionlist:
            clinton_tweets[tweet['id']] = {} 
            clinton_tweets[tweet['id']]['text'] = tweet['text']
            try:
                clinton_tweets[tweet['id']]['location'] = tweet['place']['full_name']
            except TypeError:
                continue
        if 25073877 in mentionlist and 1339835893 not in mentionlist:
            trump_tweets[tweet['id']] = {} 
            trump_tweets[tweet['id']]['text'] = tweet['text']
            try:
                trump_tweets[tweet['id']]['location'] = tweet['place']['full_name']
            except TypeError:
                continue
            

#clean the subset of tweets
cframe = pd.DataFrame.from_dict(clinton_tweets).T
cframe['no_urls'] = cframe['text'].apply(lambda x: remove_urls(x)) #remove urls
cframe['unmentioned'] = cframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
cframe['depunctualized'] = cframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
cframe['lemmed'] = cframe['depunctualized'].apply(lambda x: preprocess(x)) #lemmatize
cframe.head()


tframe = pd.DataFrame.from_dict(trump_tweets).T
tframe['no_urls'] = tframe['text'].apply(lambda x: remove_urls(x)) #remove urls
tframe['unmentioned'] = tframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
tframe['depunctualized'] = tframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
tframe['lemmed'] = tframe['depunctualized'].apply(lambda x: preprocess(x)) #lemmatize
tframe.head()

Unnamed: 0,location,text,no_urls,unmentioned,depunctualized,lemmed
764039724818272256,"Frontenac, MO",@theblaze @realDonaldTrump https://t.co/TY9DlZ...,@theblaze @realDonaldTrump,,,[]
764039733076897792,"Baton Rouge, LA",@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,\nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \...,\nALL IN COLLUSION TOGETHER \n\nNOJUSTICE \n...,"[collus, nojustic, trumppenc]"
764039769244348417,"Frontenac, MO",@theblaze @realDonaldTrump https://t.co/n050DB...,@theblaze @realDonaldTrump,,,[]
764039917924069376,"Palm Springs, CA","@realDonaldTrump, you wouldn't recognize a lie...","@realDonaldTrump, you wouldn't recognize a lie...",", you wouldn't recognize a lie if it came from...",you wouldnt recognize a lie if it came from y...,"[wouldnt, recogn, come, mouth, continu, nevert..."
764039926161604608,"Secaucus, NJ","""Kid, you know, suing someone? Thats the most ...","""Kid, you know, suing someone? Thats the most ...","""Kid, you know, suing someone? Thats the most ...",Kid you know suing someone Thats the most beau...,"[know, sue, that, beauti, thing, human, human]"


In [27]:
# with open('data/candidate_tweets.pkl', 'wb') as f:
#     pickle.dump((clinton_tweets, trump_tweets), f)

In [79]:
filename = 'models/MNB_sentiment_classifier_model.sav'

mnb, cv = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

#mnb_accuracy = mnb.score(X_val_vectorized, y_val)
#print(mnb_accuracy)

In [80]:
cframe['tokenized'] = cframe['depunctualized'].apply(lambda x: tokenization(x.lower())) #tokenize
cframe['improved_tokenized'] = cframe['tokenized'].apply(lambda x: remove_empty_tokens(x)) #remove empty tokens
cframe['nonstop'] = cframe['improved_tokenized'].apply(lambda x: remove_stopwords(x)) #remove stopwords
cframe['nw_input'] = cframe['nonstop'].apply(lambda x: " ".join(x)) #rejoin for vectorization

tframe['tokenized'] = tframe['depunctualized'].apply(lambda x: tokenization(x.lower())) #tokenize
tframe['improved_tokenized'] = tframe['tokenized'].apply(lambda x: remove_empty_tokens(x)) #remove empty tokens
tframe['nonstop'] = tframe['improved_tokenized'].apply(lambda x: remove_stopwords(x)) #remove stopwords
tframe['nw_input'] = tframe['nonstop'].apply(lambda x: " ".join(x)) #rejoin for vectorization

In [81]:
clinton = cframe[cframe.nw_input != ''].copy()
trump = tframe[tframe.nw_input != ''].copy()

clinton_vectorized = cv.transform(clinton.nw_input.to_numpy())
trump_vectorized = cv.transform(trump.nw_input.to_numpy())

In [82]:
clinton_predictions = mnb.predict(clinton_vectorized)
trump_predictions = mnb.predict(trump_vectorized)

transformed_clinton_predictions = ['Negative' if prediction == 0 else 'Positive' for prediction in clinton_predictions]
transformed_trump_predictions = ['Negative' if prediction == 0 else 'Positive' for prediction in trump_predictions]

In [84]:
clinton['sentiment'] = transformed_clinton_predictions
trump['sentiment'] = transformed_trump_predictions

In [87]:
# transformdict = {'tweet': clinton, 'sentiment': transformed_clinton_predictions}
# clinton_sentiment = pd.DataFrame(transformdict)
# clinton_sentiment.head()

# transformdict = {'tweet': trump, 'sentiment': transformed_trump_predictions}
# trump_sentiment = pd.DataFrame(transformdict)
# trump_sentiment.head()

clinton.to_pickle('clinton_with_location.pkl')
trump.to_pickle('trump_with_location.pkl')

In [40]:
clinton_sentiment.head(20)

Unnamed: 0,tweet,sentiment
0,one year things done eight,Positive
1,cofounder isis crooked evil lying witch live,Negative
2,isis cofounder hillary clinton obama also devi...,Negative
3,hannity think disbarred ignorant mr hamburg di...,Negative
4,true us officially selling arms isis,Positive
5,wake dems hillary cofounder isis,Negative
6,isis policies created uprising youareresponsib...,Positive
7,lady hillary best president thank much,Positive
8,way today always,Positive
9,corrupt perjurer treason antun soldout america...,Negative


In [41]:
trump_sentiment.head(20)

Unnamed: 0,tweet,sentiment
0,collusion together nojustice trumppence,Negative
1,wouldnt recognize lie came mouth continually n...,Negative
2,kid know suing someone thats beautiful thing h...,Positive
3,want comparison try maimed vets pre amp post i...,Positive
4,totally concur election cra cra n corruption g...,Negative
5,issues idiot claim founded isis trump go hell ...,Negative
6,isnt rape allegations getting attention cause ...,Negative
7,stop worrying msm lies focus econampimmampdfns...,Negative
8,morningjoe wonder mentality members laughed am...,Positive
9,yr crazy performance due dementia clever plot ...,Negative


In [42]:
# clinton_sentiment.to_pickle('data/clinton_sentiment.pkl')
# trump_sentiment.to_pickle('data/trump_sentiment.pkl')

In [7]:
clinton_sentiment = pd.read_pickle('data/clinton_sentiment.pkl')
trump_sentiment = pd.read_pickle('data/trump_sentiment.pkl')
print(clinton_sentiment.shape, trump_sentiment.shape)

(102744, 2) (273298, 2)


#### Training the LDA model

In [8]:
with open('data/geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    all_tweets = [json.loads(line)['text'] for line in f]
            
            

#clean the subset of tweets
allframe = pd.DataFrame(all_tweets, columns=['text'])
allframe['no_urls'] = allframe['text'].apply(lambda x: remove_urls(x)) #remove urls
allframe['unmentioned'] = allframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
allframe['depunctualized'] = allframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
allframe['lemmed'] = allframe['depunctualized'].apply(lambda x: preprocess(x)) #lemmatize
allframe.head()

Unnamed: 0,text,no_urls,unmentioned,depunctualized,lemmed
0,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,@theblaze @realDonaldTrump,,,[]
1,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,\nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \...,\nALL IN COLLUSION TOGETHER \n\nNOJUSTICE \n...,"[collus, nojustic, trumppenc]"
2,@theblaze @realDonaldTrump https://t.co/n050DB...,@theblaze @realDonaldTrump,,,[]
3,@HillaryClinton he will do in one year all the...,@HillaryClinton he will do in one year all the...,he will do in one year all the things you sho...,he will do in one year all the things you sho...,"[year, thing]"
4,#CNN #newday clear #Trump deliberately throwin...,#CNN #newday clear #Trump deliberately throwin...,#CNN #newday clear #Trump deliberately throwin...,CNN newday clear Trump deliberately throwing t...,"[newday, clear, trump, deliber, throw, racein,..."


In [150]:
dictionary = gensim.corpora.Dictionary(trump_negative['lemmed'])
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 come
1 continu
2 mouth
3 nevertrump
4 recogn
5 wouldnt
6 comparison
7 graph
8 iraq
9 maim
10 post


In [151]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [152]:
bow_corpus = [dictionary.doc2bow(doc) for doc in trump_negative['lemmed']]
bow_corpus[4310]

[(20, 1), (124, 1), (271, 2), (327, 1), (1146, 1), (1420, 1)]

In [153]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 20 ("trump") appears 1 time.
Word 124 ("go") appears 1 time.
Word 271 ("wont") appears 2 time.
Word 327 ("lose") appears 1 time.
Word 1146 ("differ") appears 1 time.
Word 1420 ("ryan") appears 1 time.


In [154]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [155]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.044*"tax" + 0.042*"releas" + 0.038*"return" + 0.032*"lie" + 0.025*"trump" + 0.025*"hide" + 0.023*"speech" + 0.017*"medic" + 0.015*"record" + 0.014*"putin"
Topic: 1 
Words: 0.057*"hillari" + 0.054*"trump" + 0.036*"clinton" + 0.019*"crook" + 0.014*"corrupt" + 0.014*"money" + 0.013*"obama" + 0.013*"email" + 0.012*"plan" + 0.012*"right"
Topic: 2 
Words: 0.036*"fuckyoudonald" + 0.029*"racist" + 0.027*"idiot" + 0.024*"didnt" + 0.024*"fuck" + 0.024*"shit" + 0.017*"ignor" + 0.015*"america" + 0.015*"disgust" + 0.011*"hell"
Topic: 3 
Words: 0.077*"like" + 0.032*"look" + 0.025*"deplor" + 0.020*"sick" + 0.019*"hillari" + 0.015*"need" + 0.012*"lose" + 0.011*"person" + 0.010*"debat" + 0.010*"call"
Topic: 4 
Words: 0.030*"news" + 0.018*"shame" + 0.014*"report" + 0.014*"loser" + 0.013*"hat" + 0.012*"live" + 0.012*"campaign" + 0.011*"probabl" + 0.011*"support" + 0.010*"school"
Topic: 5 
Words: 0.034*"liar" + 0.025*"tell" + 0.022*"believ" + 0.020*"wrong" + 0.019*"know" + 0.019*"away" 

In [116]:
# tfidf = models.TfidfModel(bow_corpus)
# corpus_tfidf = tfidf[bow_corpus]

# for doc in corpus_tfidf:
#     pprint(doc)
#     break

[]


In [118]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.080*"today" + 0.069*"trump" + 0.067*"love" + 0.043*"help" + 0.040*"pressur" + 0.040*"fine" + 0.038*"rain" + 0.037*"forecast" + 0.037*"tempcrab" + 0.037*"orchard"
Topic: 1 
Words: 0.053*"maga" + 0.043*"trump" + 0.039*"hillaryclinton" + 0.029*"trumppenc" + 0.027*"truth" + 0.024*"donaldtrump" + 0.021*"tell" + 0.019*"hillari" + 0.018*"elect" + 0.016*"poll"
Topic: 2 
Words: 0.019*"question" + 0.019*"trump" + 0.018*"hillari" + 0.018*"obama" + 0.015*"wrong" + 0.014*"news" + 0.013*"live" + 0.013*"answer" + 0.013*"leader" + 0.012*"press"
Topic: 3 
Words: 0.049*"neverhillari" + 0.040*"crookedhillari" + 0.029*"health" + 0.027*"hillaryshealth" + 0.021*"hillari" + 0.021*"trump" + 0.017*"maga" + 0.015*"hillaryclinton" + 0.015*"women" + 0.014*"apolog"
Topic: 4 
Words: 0.083*"trump" + 0.029*"putin" + 0.021*"donald" + 0.018*"say" + 0.017*"presid" + 0.015*"agre" + 0.013*"plan" + 0.013*"campaign" + 0.012*"russia" + 0.011*"go"
Topic: 5 
Words: 0.056*"clinton" + 0.054*"hillari" + 0.051*"

In [51]:
#Probably not scientifically correct

# lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
# for idx, topic in lda_model_tfidf.print_topics(-1):
#     print('Topic: {} Word: {}'.format(idx, topic))

### Use the LDA model

In [46]:
def getTopicForQuery(tweet):
    ques_vec = []
    ques_vec = dictionary.doc2bow(tweet)

    topic_vec = []
    topic_vec = lda_model[ques_vec]

    word_count_array = np.empty((len(topic_vec), 2), dtype = np.object)
    for i in range(len(topic_vec)):
        word_count_array[i, 0] = topic_vec[i][0]
        word_count_array[i, 1] = topic_vec[i][1]

    idx = np.argsort(word_count_array[:, 1])
    idx = idx[::-1]
    word_count_array = word_count_array[idx]

    final = []
    final = lda_model.print_topic(word_count_array[0, 0], 1)

    question_topic = final.split('*') ## as format is like "probability * topic"

    return question_topic[1]

In [88]:
clinton_negative = clinton.loc[clinton.sentiment=='Negative'].copy()
clinton_positive = clinton.loc[clinton.sentiment=='Positive'].copy()

trump_negative = trump.loc[trump.sentiment=='Negative'].copy()
trump_positive = trump.loc[trump.sentiment=='Positive'].copy()

print(clinton_negative.shape, clinton_positive.shape, trump_negative.shape, trump_positive.shape)

(53262, 11) (49440, 11) (133541, 11) (139651, 11)


In [89]:
print(clinton_negative.shape[0]/clinton_sentiment.shape[0], trump_negative.shape[0]/trump_sentiment.shape[0])

0.518395234758234 0.48862779822757574


In [34]:
clinton_negative['lemmed'] = clinton_negative['tweet'].apply(lambda x: preprocess(x)) #lemmatize
clinton_positive['lemmed'] = clinton_positive['tweet'].apply(lambda x: preprocess(x)) #lemmatize

trump_negative['lemmed'] = trump_negative['tweet'].apply(lambda x: preprocess(x)) #lemmatize
trump_positive['lemmed'] = trump_positive['tweet'].apply(lambda x: preprocess(x)) #lemmatize

In [91]:
clinton_negative.to_pickle('data/clinton_negative.pkl')
clinton_positive.to_pickle('data/clinton_positive.pkl')

trump_negative.to_pickle('data/trump_negative.pkl')
trump_positive.to_pickle('data/trump_positive.pkl')

In [122]:
clinton_negative['lemmed'][764042004137447425]

['corrupt', 'perjur', 'treason', 'antun', 'soldout', 'america']

In [123]:
getTopicForQuery(clinton_negative['lemmed'][764042004137447425])

'"racist"'