### Training the model

In [1]:
import string
import re
import json
import pickle
import pandas as pd
import nltk
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

# English stop words from NLTK, stored as a frozenset so that the per-token
# membership test in remove_stopwords is a constant-time hash lookup instead of
# a linear scan over a list (this check runs for every token of every tweet).
# NOTE: requires the NLTK "stopwords" corpus to be available locally.
stopword = frozenset(nltk.corpus.stopwords.words('english'))
# Single Porter stemmer instance shared by stemming().
ps = nltk.PorterStemmer()

In [2]:
# Broader emoji stripper, kept commented out; uncomment it if a heavier emoji filter is needed

# def remove_emoji(string):
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                u"\U00002500-\U00002BEF"  # chinese char
#                                u"\U00002702-\U000027B0"
#                                u"\U00002702-\U000027B0"
#                                u"\U000024C2-\U0001F251"
#                                u"\U0001f926-\U0001f937"
#                                u"\U00010000-\U0010ffff"
#                                u"\u2640-\u2642"
#                                u"\u2600-\u2B55"
#                                u"\u200d"
#                                u"\u23cf"
#                                u"\u23e9"
#                                u"\u231a"
#                                u"\ufe0f"  # dingbats
#                                u"\u3030"
#                                "]+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'', string)

In [3]:
def remove_punct(text):
    """Delete ASCII punctuation and digit runs from ``text``.

    Every character listed in ``string.punctuation`` is removed first, then
    each run of the digits 0-9 is removed. Whitespace is left as it is, so
    removing a number between two spaces leaves both spaces behind.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punct)

def tokenization(text):
    """Split ``text`` into a list of tokens on runs of non-word characters.

    A separator at the very start or end of ``text`` produces an empty-string
    token at that end of the list; those are filtered out later by
    remove_empty_tokens.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape
    # sequence and triggers a DeprecationWarning/SyntaxWarning.
    return re.split(r'\W+', text)

def remove_urls(text):
    """Return ``text`` with every ``http://`` or ``https://`` URL deleted.

    Only the URL itself is removed; the whitespace around it stays in place.
    """
    url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    return re.sub(url_pattern, '', text)

def remove_mentions(text):
    """Return ``text`` with ``@handle`` mentions (1-15 word characters) deleted.

    A mention is matched only at the start of the text or right after a
    character that is neither ``@`` nor a word character; that preceding
    character is deleted together with the mention. An ``@`` glued to a word
    (``a@b.com``) or to a previous mention is therefore left in place.
    """
    mention_pattern = r'(^|[^@\w])@(\w{1,15})\b'
    return re.sub(mention_pattern, '', text)

def remove_empty_tokens(text):
    """Return a new list holding only the truthy (non-empty) tokens of ``text``."""
    return [token for token in text if token]

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stopword`` collection."""
    return list(filter(lambda word: word not in stopword, text))

def stemming(text):
    """Return a list with every token of ``text`` stemmed by the module-level stemmer ``ps``."""
    return list(map(ps.stem, text))

def deEmojify(text):
    """Return ``text`` with characters from four emoji code-point ranges removed."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class covering all ranges; '+' removes whole runs at once.
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

In [4]:
# Label encoding in the 'target' column: 0 = negative, 4 = positive.
# The CSV has no header row, so the column names are supplied explicitly.

df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    header=None,
    encoding='ISO-8859-1',
)
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Cleaning pipeline: every stage reads the previous stage's column and writes
# its result to a new column, so intermediate results stay inspectable.
df['no_urls'] = df['text'].apply(remove_urls)  # strip URLs
df['unmentioned'] = df['no_urls'].apply(remove_mentions)  # strip @mentions
df['depunctualized'] = df['unmentioned'].apply(remove_punct)  # strip punctuation and digits
df['tokenized'] = df['depunctualized'].apply(lambda tweet: tokenization(tweet.lower()))  # lowercase, then tokenize
df['improved_tokenized'] = df['tokenized'].apply(remove_empty_tokens)  # drop empty tokens
df['nonstop'] = df['improved_tokenized'].apply(remove_stopwords)  # drop stop words
df['stemmed'] = df['nonstop'].apply(stemming)  # stem each token
df['rejoined'] = df['stemmed'].apply(" ".join)  # back to one string per tweet for the vectorizer
df.head()

Unnamed: 0,target,id,date,flag,user,text,no_urls,unmentioned,depunctualized,tokenized,improved_tokenized,nonstop,stemmed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot - Awww, that's a bummer. You sho...","- Awww, that's a bummer. You shoulda got Da...",Awww thats a bummer You shoulda got David ...,"[, awww, thats, a, bummer, you, shoulda, got, ...","[awww, thats, a, bummer, you, shoulda, got, da...","[awww, thats, bummer, shoulda, got, david, car...","[awww, that, bummer, shoulda, got, david, carr..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, faceb...","[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might...","[upset, cant, updat, facebook, text, might, cr..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,I dived many times for the ball Managed to sa...,"[, i, dived, many, times, for, the, ball, mana...","[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...","[dive, mani, time, ball, manag, save, rest, go..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]","[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...",no its not behaving at all im mad why am i he...,"[, no, its, not, behaving, at, all, im, mad, w...","[no, its, not, behaving, at, all, im, mad, why...","[behaving, im, mad, cant, see]","[behav, im, mad, cant, see]"


In [13]:
# Pull the cleaned text (features) and the labels out of the frame as NumPy arrays

X = df['rejoined'].to_numpy()
y = df['target'].to_numpy()

In [14]:
# Split into train (80%) and a held-out pool (20%), stratified on the label.
# random_state is fixed so a fresh "Restart & Run All" reproduces the same split
# (and therefore the same metrics); without it every run shuffles differently.

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(sss.split(X, y))
print("TRAIN:", train_index, "TEST:", test_index)
X_train, X_t = X[train_index], X[test_index]
y_train, y_t = y[train_index], y[test_index]

TRAIN: [ 125577   97852 1405317 ...  384833 1256796  886573] TEST: [ 973223 1219492 1199777 ...  981249 1390133  169748]


In [15]:
# Split the held-out 20% pool in half: test (10% of all data) and validation (10%).
# random_state is fixed so the split is reproducible across kernel restarts.

sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=0.5, random_state=42)

# n_splits=1, so the generator yields exactly one pair of index arrays.
# Note the naming: the "train" half of this split becomes the test set and
# the "test" half becomes the validation set.
test_i, val_i = next(sss_val.split(X_t, y_t))
print("TEST:", test_i, "Val:", val_i)
X_test, X_val = X_t[test_i], X_t[val_i]
y_test, y_val = y_t[test_i], y_t[val_i]

TEST: [ 68126 127283  13240 ... 175548 304617 290349] Val: [ 87327 132810 267834 ...  87109 189623 270457]


In [16]:
# Sanity check: print the shape of every split (features and labels should pair up)
print(*[split.shape for split in (X_train, y_train, X_test, y_test, X_val, y_val)])

(1280000,) (1280000,) (160000,) (160000,) (160000,) (160000,)


In [17]:
# Vectorize the tweets as binary bag-of-words (term present / absent).

cv = CountVectorizer(binary=True)

# fit_transform learns the vocabulary from the training split and encodes it in
# a single pass, instead of tokenizing the whole training set twice
# (once in fit, once in transform). The vocabulary is learned from the training
# data only; test and validation are encoded with that fitted vocabulary.
X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)
X_val_vectorized = cv.transform(X_val)

In [18]:
# Fit a multinomial Naive Bayes on the training split and count validation errors
y_pred = (
    MultinomialNB()
    .fit(X_train_vectorized, y_train)
    .predict(X_val_vectorized)
)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_val_vectorized.shape[0], np.count_nonzero(y_val != y_pred)))

Number of mislabeled points out of a total 160000 points : 37394


In [19]:
# Final trained model, evaluated on the validation split
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)
mnb_predictions = mnb.predict(X_val_vectorized)
# Same number as mnb.score(X_val_vectorized, y_val): mean accuracy on the validation split
mnb_accuracy = metrics.accuracy_score(y_val, mnb_predictions)
print(mnb_accuracy)

0.7662875


In [20]:
# Per-class precision / recall / F1 on the validation split
print(
    metrics.classification_report(
        y_val,
        mnb_predictions,
        target_names=['negative sentiment', 'positive sentiment'],
        digits=3,
    )
)

                    precision    recall  f1-score   support

negative sentiment      0.756     0.786     0.771     80000
positive sentiment      0.777     0.747     0.762     80000

         micro avg      0.766     0.766     0.766    160000
         macro avg      0.767     0.766     0.766    160000
      weighted avg      0.767     0.766     0.766    160000



In [53]:
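# Save the trained model together with the fitted vectorizer as one pickled (mnb, cv) tuple (commented out; uncomment to write the file)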

# filename = 'MNB_sentiment_classifier_model_lemmatized.sav'
# with open(filename, 'wb') as f_out:
#     pickle.dump((mnb, cv), f_out)

In [None]:
# Load the model from disk.
# NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
filename = 'MNB_sentiment_classifier_model_lemmatized.sav'

# Assumes the file was written by the (commented-out) save cell, which dumps a
# (mnb, cv) tuple: unpack the classifier and the fitted vectorizer.
# The `with` block closes the file handle, which the bare open() never did.
with open(filename, 'rb') as f_in:
    loaded_model, loaded_cv = pickle.load(f_in)

# X_test holds raw cleaned strings, so it must go through the loaded vectorizer
# before scoring; the labels live in y_test (Y_test was never defined).
result = loaded_model.score(loaded_cv.transform(X_test), y_test)
print(result)

### Testing the model on provided tweet dataset

In [21]:
# Read only the first 30 tweets of the file (one JSON document per line)

jsonlist = []
c = 0  # number of lines consumed so far
with open('geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    for c, line in enumerate(f, start=1):
        jsonlist.append(json.loads(line))
        if c == 30:
            break

In [31]:
# Clean the subset of tweets with the same pipeline that was applied to the training data.

# test_input was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; build it from the tweets parsed into jsonlist.
# NOTE(review): assumes each parsed tweet stores its body under the 'text' key - confirm against the .jsons file.
test_input = [tweet['text'] for tweet in jsonlist]

testframe = pd.DataFrame(test_input, columns=['text'])
testframe['no_urls'] = testframe['text'].apply(remove_urls)  # strip URLs
testframe['unmentioned'] = testframe['no_urls'].apply(remove_mentions)  # strip @mentions
testframe['depunctualized'] = testframe['unmentioned'].apply(remove_punct)  # strip punctuation and digits
testframe['tokenized'] = testframe['depunctualized'].apply(lambda tweet: tokenization(tweet.lower()))  # lowercase, then tokenize
testframe['improved_tokenized'] = testframe['tokenized'].apply(remove_empty_tokens)  # drop empty tokens
testframe['nonstop'] = testframe['improved_tokenized'].apply(remove_stopwords)  # drop stop words
testframe['stemmed'] = testframe['nonstop'].apply(stemming)  # stem each token
testframe['rejoined'] = testframe['stemmed'].apply(" ".join)  # back to one string per tweet
testframe.head()

Unnamed: 0,text,no_urls,unmentioned,depunctualized,tokenized,improved_tokenized,nonstop,stemmed,rejoined
0,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,@theblaze @realDonaldTrump,,,"[, ]",[],[],[],
1,@BarackObama @FBI@LORETTALYNCH ALL IN COLLUSIO...,@BarackObama @FBI@LORETTALYNCH ALL IN COLLUSIO...,@LORETTALYNCH ALL IN COLLUSION TOGETHER #NOJUS...,LORETTALYNCH ALL IN COLLUSION TOGETHER NOJUSTI...,"[lorettalynch, all, in, collusion, together, n...","[lorettalynch, all, in, collusion, together, n...","[lorettalynch, collusion, together, nojustice,...","[lorettalynch, collus, togeth, nojustic, trump...",lorettalynch collus togeth nojustic trumppenc
2,@theblaze @realDonaldTrump https://t.co/n050DB...,@theblaze @realDonaldTrump,,,"[, ]",[],[],[],
3,@HillaryClinton he will do in one year all the...,@HillaryClinton he will do in one year all the...,he will do in one year all the things you sho...,he will do in one year all the things you sho...,"[, he, will, do, in, one, year, all, the, thin...","[he, will, do, in, one, year, all, the, things...","[one, year, things, done, eight]","[one, year, thing, done, eight]",one year thing done eight
4,#CNN #newday clear #Trump deliberately throwin...,#CNN #newday clear #Trump deliberately throwin...,#CNN #newday clear #Trump deliberately throwin...,CNN newday clear Trump deliberately throwing t...,"[cnn, newday, clear, trump, deliberately, thro...","[cnn, newday, clear, trump, deliberately, thro...","[cnn, newday, clear, trump, deliberately, thro...","[cnn, newday, clear, trump, deliber, throw, ra...",cnn newday clear trump deliber throw racein kn...


In [44]:
# Keep only tweets that still contain text after cleaning (drops empty strings)
testinput = [cleaned for cleaned in testframe['rejoined'] if cleaned]
testinput

['lorettalynch collus togeth nojustic trumppenc',
 'one year thing done eight',
 'cnn newday clear trump deliber throw racein knew isi destabil mideast start wiraq invas',
 'wouldnt recogn lie came mouth continu nevertrump',
 'trump trumppenc makeamericagreatagain',
 'kid know su someon that beauti thing human could anoth human',
 'cofound isi crook evil lie witch live',
 'want comparison tri maim vet pre amp post iraq pullout bar graph',
 'total concur elect cra cra n corrupt gov mind blow trump last hope',
 'issu idiot claim found isi trump go hell lie amp steal shame',
 'cant stand ortak look windont settl teamgov youin',
 'isnt rape alleg get attent caus seem probabl',
 'stole white hous furnitur',
 'gop plead w trump control behavior week want year terrifi nevertrump crazi',
 'isi cofound hillari clinton obama also devil hillari sit left hand devil',
 'come jesu meet earth suppos',
 'hanniti think disbar ignor mr hamburg dishonest',
 'stop worri msm lie focu econampimmampdfn ur sp

In [46]:
# Encode the cleaned tweets with the vectorizer `cv` that was already fitted
# above (transform only - the vectorizer is not re-fitted here)

test_input_vectorized = cv.transform(testinput)

In [47]:
# `test` holds one predicted label per entry of testinput, in the same order
test = mnb.predict(test_input_vectorized) #predict the sentiment of the tweets

In [62]:
# Predicted sentiment next to the cleaned (stemmed) tweet it was predicted for

output = pd.DataFrame({'sentiment': test, 'tweet': testinput})
output

Unnamed: 0,sentiment,tweet
0,4,lorettalynch collus togeth nojustic trumppenc
1,0,one year thing done eight
2,0,cnn newday clear trump deliber throw racein kn...
3,0,wouldnt recogn lie came mouth continu nevertrump
4,4,trump trumppenc makeamericagreatagain
5,4,kid know su someon that beauti thing human cou...
6,0,cofound isi crook evil lie witch live
7,0,want comparison tri maim vet pre amp post iraq...
8,0,total concur elect cra cra n corrupt gov mind ...
9,0,issu idiot claim found isi trump go hell lie a...


In [52]:
# Original tweet text before any cleaning, one tweet per printed line

for raw_tweet in testframe['text']:
    print(raw_tweet)

@theblaze @realDonaldTrump https://t.co/TY9DlZ584c
@BarackObama @FBI@LORETTALYNCH ALL IN COLLUSION TOGETHER #NOJUSTICE @realDonaldTrump #TrumpPence https://t.co/5GMNZq40V3
@theblaze @realDonaldTrump https://t.co/n050DBSpv0
@HillaryClinton he will do in one year all the things you should have done in eight
#CNN #newday clear #Trump deliberately throwing this race,in 2007 he knew that #ISIS and destabilization of Mideast started w/Iraq invasion
@realDonaldTrump, you wouldn't recognize a lie if it came from your own mouth, and they do continually. #NeverTrump https://t.co/pKSQM8yikm
#Trump2016 #TrumpPence16 #MakeAmericaGreatAgain  https://t.co/l5UsYANVc9
"Kid, you know, suing someone? Thats the most beautiful thing 1 human being could do to another human being" @funnyordie @realDonaldTrump😂💩s
@HillaryClinton you ARE the co-founder of ISIS, you crooked, evil, lying, witch. How can you live with yourself?
@Geraldanthro @NeilTurner_ @realDonaldTrump want to do a comparison try maimed Vets pr