### Training the model

In [2]:
import string
import re
import json
import pickle
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [None]:
#if heavier emoji stripper is needed

# def remove_emoji(string):
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                u"\U00002500-\U00002BEF"  # chinese char
#                                u"\U00002702-\U000027B0"
#                                u"\U00002702-\U000027B0"
#                                u"\U000024C2-\U0001F251"
#                                u"\U0001f926-\U0001f937"
#                                u"\U00010000-\U0010ffff"
#                                u"\u2640-\u2642"
#                                u"\u2600-\u2B55"
#                                u"\u200d"
#                                u"\u23cf"
#                                u"\u23e9"
#                                u"\u231a"
#                                u"\ufe0f"  # dingbats
#                                u"\u3030"
#                                "]+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'', string)

In [3]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

def tokenization(text):
    text = re.split('\W+', text)
    return text

def remove_urls(text):
    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text)
    return text

def remove_mentions(text):
    text = re.sub(r'(^|[^@\w])@(\w{1,15})\b', '', text)
    return text

def remove_empty_tokens(text):
    text = list(filter(None, text))
    return text

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [5]:
# 0 = negative, 4 = positive

df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None, names=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
df['no_urls'] = df['text'].apply(lambda x: remove_urls(x)) #remove urls
df['unmentioned'] = df['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
df['depunctualized'] = df['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
df['tokenized'] = df['depunctualized'].apply(lambda x: tokenization(x.lower())) #tokenize
df['improved_tokenized'] = df['tokenized'].apply(lambda x: remove_empty_tokens(x)) #remove empty tokens
df['nonstop'] = df['improved_tokenized'].apply(lambda x: remove_stopwords(x)) #remove stopwords
df['stemmed'] = df['nonstop'].apply(lambda x: stemming(x)) #stem the tokens
df['rejoined'] = df['stemmed'].apply(lambda x: " ".join(x)) #rejoin for vectorization
df.head()

Unnamed: 0,target,id,date,flag,user,text,no_urls,unmentioned,depunctualized,tokenized,improved_tokenized,nonstop,stemmed,rejoined
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot - Awww, that's a bummer. You sho...","- Awww, that's a bummer. You shoulda got Da...",Awww thats a bummer You shoulda got David ...,"[, awww, thats, a, bummer, you, shoulda, got, ...","[awww, thats, a, bummer, you, shoulda, got, da...","[awww, thats, bummer, shoulda, got, david, car...","[awww, that, bummer, shoulda, got, david, carr...",awww that bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, faceb...","[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might...","[upset, cant, updat, facebook, text, might, cr...",upset cant updat facebook text might cri resul...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,I dived many times for the ball Managed to sa...,"[, i, dived, many, times, for, the, ball, mana...","[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...","[dive, mani, time, ball, manag, save, rest, go...",dive mani time ball manag save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]","[whole, bodi, feel, itchi, like, fire]",whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...",no its not behaving at all im mad why am i he...,"[, no, its, not, behaving, at, all, im, mad, w...","[no, its, not, behaving, at, all, im, mad, why...","[behaving, im, mad, cant, see]","[behav, im, mad, cant, see]",behav im mad cant see


In [66]:
#extract to numpy arrays

y = df.target.to_numpy()
X = df.rejoined.to_numpy()

In [67]:
#split into train (80%) and test (20%)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8)

for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_t = X[train_index], X[test_index]
    y_train, y_t = y[train_index], y[test_index]

TRAIN: [ 676295 1080293   77706 ...    9768  481696 1177386] TEST: [ 475733  253127 1507988 ... 1139612 1488894 1556879]


In [68]:
#split test into test (10%) and val (10%)

sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=0.5)

for test_i, val_i in sss_val.split(X_t, y_t):
    print("TEST:", test_i, "Val:", val_i)
    X_test, X_val = X_t[test_i], X_t[val_i]
    y_test, y_val = y_t[test_i], y_t[val_i]

TEST: [ 74429  54573 254983 ... 252588  38747 171811] Val: [129815 234404 135075 ... 200157 157826  52091]


In [69]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape) #check if split sizes are correct

(1280000,) (1280000,) (160000,) (160000,) (160000,) (160000,)


In [70]:
#vectorize the tweets

cv = CountVectorizer(binary=True)
cv.fit(X_train) #fit the vectorizer

X_train_vectorized = cv.transform(X_train)
X_test_vectorized = cv.transform(X_test)
X_val_vectorized = cv.transform(X_val)

In [71]:
y_pred = MultinomialNB().fit(X_train_vectorized, y_train).predict(X_val_vectorized)
print("Number of mislabeled points out of a total %d points : %d"
       % (X_val_vectorized.shape[0], (y_val != y_pred).sum()))

Number of mislabeled points out of a total 160000 points : 37082


In [72]:
mnb = MultinomialNB().fit(X_train_vectorized, y_train) #final trained model
mnb_predictions = mnb.predict(X_val_vectorized)
mnb_accuracy = mnb.score(X_val_vectorized, y_val)
print(mnb_accuracy)

0.7682375


In [73]:
print(metrics.classification_report(y_val, mnb_predictions, digits=3, target_names=['negative sentiment', 'positive sentiment']))

                    precision    recall  f1-score   support

negative sentiment      0.758     0.788     0.773     80000
positive sentiment      0.780     0.748     0.763     80000

          accuracy                          0.768    160000
         macro avg      0.769     0.768     0.768    160000
      weighted avg      0.769     0.768     0.768    160000



In [74]:
#save trained model as pickle

# filename = 'MNB_sentiment_classifier_model_lemmatized.sav'
# with open(filename, 'wb') as f_out:
#     pickle.dump((mnb, cv), f_out)

In [77]:
#load the model from disk
filename = 'MNB_sentiment_classifier_model_lemmatized.sav'

loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

AttributeError: 'tuple' object has no attribute 'score'

In [78]:
#load the model from disk
filename = 'MNB_sentiment_classifier_model_lemmatized.sav'

mnb, cv = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

mnb_accuracy = mnb.score(X_cal_vectorized, y_val)
print(mnb_accuracy)

NameError: name 'X_cal_vectorized' is not defined

### Testing the model on provided tweet dataset

In [35]:
#take a subset of 30 tweets

jsonlist = []
c = 0
with open('geotagged_tweets_20160812-0912.jsons', 'r', encoding='utf-8') as f:
    for line in f:
        c+=1
        jsonlist.append(json.loads(line))
        if c == 30:
            break

In [45]:
def main():
    tweet_file = ['geotagged_tweets_20160812-0912.json']
    tweets = []
    for file in tweet_files:
        with open(file, 'r')as f:
            for line in f.readlines():
                tweets.append(json.loads(line))
    populate_tweet_df(tweets)

In [56]:
def populate_tweet_df(tweets):
    df=pd.DataFrame()
    
    df['created_at']= df['created_at'].apply(lamda x: tweet(x))
    
    print(df)
    
 
   

SyntaxError: invalid syntax (<ipython-input-56-c138d136b316>, line 4)

In [None]:
 body={"source": dict_data["source"],
                           "created_at": dict_data["user"]["created_at"],
                           "author": dict_data["user"]["screen_name"],
                           "location": dict_data["user"]["location"],
                           "time_zone": dict_data["user"]["time_zone"],
                           "followers": dict_data["user"]["followers_count"],
                           "friends": dict_data["user"]["friends_count"],
                           "timestamp": timestamp,
                           "message": dict_data["text"],
                           "polarity": tweet.sentiment.polarity,
                           "subjectivity": tweet.sentiment.subjectivity,
                           "sentiment": sentiment})
    
    
     dict_data = json.loads(data)

In [36]:
#clean the subset of tweets

testframe = pd.DataFrame(test_input, columns=['text'])
testframe['no_urls'] = testframe['text'].apply(lambda x: remove_urls(x)) #remove urls
testframe['unmentioned'] = testframe['no_urls'].apply(lambda x: remove_mentions(x)) #remove mentions
testframe['depunctualized'] = testframe['unmentioned'].apply(lambda x: remove_punct(x)) #remove punctuations
testframe['tokenized'] = testframe['depunctualized'].apply(lambda x: tokenization(x.lower())) #tokenize
testframe['improved_tokenized'] = testframe['tokenized'].apply(lambda x: remove_empty_tokens(x)) #remove empty tokens
testframe['nonstop'] = testframe['improved_tokenized'].apply(lambda x: remove_stopwords(x)) #remove stopwords
testframe['stemmed'] = testframe['nonstop'].apply(lambda x: stemming(x))
testframe['rejoined'] = testframe['stemmed'].apply(lambda x: " ".join(x))
testframe.head()

NameError: name 'test_input' is not defined

In [22]:
testinput = [tweet for tweet in testframe.rejoined.values if tweet] #drop empty tweets
testinput

NameError: name 'testframe' is not defined

In [24]:
#vectorize with the fitted vectorizer

test_input_vectorized = cv.transform(testinput)

NameError: name 'testinput' is not defined

In [25]:
test = mnb.predict(test_input_vectorized) #predict the sentiment of the tweets

NameError: name 'test_input_vectorized' is not defined

In [None]:
#sentiment and stemmed tweet

output = pd.DataFrame(testinput, test).reset_index()
output.columns = ['sentiment', 'tweet']
output

In [None]:
#pre cleaned tweet

for i in testframe.text:
    print(i)