# Negative words model

In [1]:
import string
import re
import json
import pickle
import pandas as pd
import pymongo
from pymongo import MongoClient
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

# English stop-word list from NLTK; consulted by remove_stopwords() below.
stopword = nltk.corpus.stopwords.words('english')
# Shared Porter stemmer instance; used by stemming() below.
ps = nltk.PorterStemmer()

In [2]:
# Connect to the local MongoDB server and pick the tweet collection
client = MongoClient(host="localhost", port=27017)
db = client["TwitterData"]
collection = db["Tweets"]

In [3]:
def remove_punct(text):
    """Return ``text`` without ASCII punctuation characters and without digits.

    Every character listed in ``string.punctuation`` is deleted first, then
    each run of the digits 0-9 is removed.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punctuation)

def tokenization(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Leading or trailing separators produce empty-string tokens at the ends of
    the result (``" hi "`` -> ``['', 'hi', '']``); callers are expected to drop
    them afterwards (see ``remove_empty_tokens``).

    Args:
        text: the string to split.

    Returns:
        list of str: the pieces between separator runs, in order.
    """
    # Raw string: '\W' in a normal literal is an invalid escape sequence, which
    # Python reports as a DeprecationWarning (SyntaxWarning from 3.12 on).
    return re.split(r'\W+', text)

def remove_urls(text):
    """Delete every ``http://`` / ``https://`` URL from ``text`` and return the rest."""
    url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    return re.sub(url_pattern, '', text)

def remove_mentions(text):
    """Strip ``@handle`` mentions (handles of 1-15 word characters) from ``text``.

    The single character in front of the ``@`` (when there is one) is part of
    the match and is removed together with the mention. An ``@`` preceded by a
    word character, as in an e-mail address, is left alone.
    """
    mention_pattern = r'(^|[^@\w])@(\w{1,15})\b'
    return re.sub(mention_pattern, '', text)

def remove_hashtags(text):
    """Strip ``#hashtag`` tokens (tags of 1-50 word characters) from ``text``.

    The single character in front of the ``#`` (when there is one) is part of
    the match and is removed together with the hashtag.
    """
    hashtag_pattern = r'(^|[^#\w])#(\w{1,50})\b'
    return re.sub(hashtag_pattern, '', text)

def remove_empty_tokens(text):
    """Return the tokens of ``text`` as a list, leaving out every falsy one (e.g. ``''``)."""
    return [token for token in text if token]

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stopword`` list."""
    kept = []
    for word in text:
        if word in stopword:
            continue
        kept.append(word)
    return kept

def stemming(text):
    """Return a list with the stem of every token in ``text``, using the shared ``ps`` stemmer."""
    stems = []
    for word in text:
        stems.append(ps.stem(word))
    return stems

def lemmatize(text):
    """Return a list with every token of ``text`` lemmatized as a verb (``pos="v"``)."""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return lemmas

def deEmojify(text):
    """Return ``text`` with every character from the listed emoji code-point ranges removed."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_matcher.sub(r'', text)

## Prepare training set

In [4]:
# Match every document, but fetch only the tweet text (the Mongo _id is excluded)
query = {}
projection = {"text": 1, "_id": 0}

# Materialise the Mongo cursor into a pandas DataFrame
cursor = collection.find(filter=query, projection=projection)
df = pd.DataFrame(list(cursor))
df.shape[0]

657307

In [5]:
# Draw the 100,000-tweet training sample with a fixed seed, so that
# Restart & Run All selects the same tweets (and reproduces the same scores) every time.
train_subset = df.sample(n=100000, random_state=42)

In [6]:
# Build the cleaning pipeline as one assign() chain. assign() works on a copy of
# train_subset and evaluates the columns in the order written, so each step can
# read the column produced by the previous one.
train_set = train_subset.assign(
    no_urls=lambda d: d['text'].apply(remove_urls),  # remove urls
    unmentioned=lambda d: d['no_urls'].apply(remove_mentions),  # remove mentions
    unhashtagged=lambda d: d['unmentioned'].apply(remove_hashtags),  # remove hashtags
    depunctualized=lambda d: d['unhashtagged'].apply(remove_punct),  # remove punctuation and digits
    tokenized=lambda d: d['depunctualized'].apply(lambda tweet: tokenization(tweet.lower())),  # lowercase + tokenize
    improved_tokenized=lambda d: d['tokenized'].apply(remove_empty_tokens),  # remove empty tokens
    nonstop=lambda d: d['improved_tokenized'].apply(remove_stopwords),  # remove stopwords
    lemmatized=lambda d: d['nonstop'].apply(lemmatize),  # lemmatize the tokens
    rejoined=lambda d: d['lemmatized'].apply(" ".join),  # rejoin for vectorization
)
train_set.head()

Unnamed: 0,text,no_urls,unmentioned,unhashtagged,depunctualized,tokenized,improved_tokenized,nonstop,lemmatized,rejoined
49193,@FoxNews @billclinton @HillaryClinton The Clin...,@FoxNews @billclinton @HillaryClinton The Clin...,The Clintons already have Trumps tax return v...,The Clintons already have Trumps tax return v...,The Clintons already have Trumps tax return v...,"[, the, clintons, already, have, trumps, tax, ...","[the, clintons, already, have, trumps, tax, re...","[clintons, already, trumps, tax, return, via, ...","[clintons, already, trump, tax, return, via, c...",clintons already trump tax return via corrupt irs
130662,So he'll leave after #HillaryClinton wins in N...,So he'll leave after #HillaryClinton wins in N...,So he'll leave after #HillaryClinton wins in N...,So he'll leave after wins in November?,So hell leave after wins in November,"[so, hell, leave, after, wins, in, november, ]","[so, hell, leave, after, wins, in, november]","[hell, leave, wins, november]","[hell, leave, win, november]",hell leave win november
595232,@HalleyBorderCol @smp0711 @FoxBusiness @LouDob...,@HalleyBorderCol @smp0711 @FoxBusiness @LouDob...,Because a Communist supports Hillary does tha...,Because a Communist supports Hillary does tha...,Because a Communist supports Hillary does tha...,"[, because, a, communist, supports, hillary, d...","[because, a, communist, supports, hillary, doe...","[communist, supports, hillary, make, communist]","[communist, support, hillary, make, communist]",communist support hillary make communist
417267,@HillaryClinton tried to donate many times ove...,@HillaryClinton tried to donate many times ove...,tried to donate many times over the last 2 m ...,tried to donate many times over the last 2 m ...,tried to donate many times over the last m l...,"[, tried, to, donate, many, times, over, the, ...","[tried, to, donate, many, times, over, the, la...","[tried, donate, many, times, last, link, n, em...","[try, donate, many, time, last, link, n, email...",try donate many time last link n email frm rob...
611006,"@MikePenceVP You just left out #DavidDuke,#Ste...","@MikePenceVP You just left out #DavidDuke,#Ste...","You just left out #DavidDuke,#SteveBannon,#Al...",You just left out &amp; in your speech !,You just left out amp in your speech,"[, you, just, left, out, amp, in, your, speech, ]","[you, just, left, out, amp, in, your, speech]","[left, amp, speech]","[leave, amp, speech]",leave amp speech


In [7]:
# Tweets that are empty after cleaning cannot be vectorized: mark them as missing,
# then drop every row that has a missing value.
# The replaced column is assigned back to the frame: calling
# replace(..., inplace=True) on the selected column is a chained in-place update,
# which under pandas Copy-on-Write only changes a temporary and leaves train_set as is.
train_set['rejoined'] = train_set['rejoined'].replace('', np.nan)
train_set.dropna(inplace=True)

In [8]:
## Adding negative words
# The lexicon holds one word per line. It is read with plain file I/O instead of
# pd.read_csv(..., sep="\n"): newer pandas releases reject "\n" as a separator, and
# read_csv would also turn entries such as "null" or "NA" into NaN instead of words.
# Blank lines are skipped and surrounding whitespace is stripped.
with open("data/negative_words.txt", encoding="utf-8") as lexicon_file:
    negative_words = [line.strip() for line in lexicon_file if line.strip()]

# Set copy of the lexicon for constant-time membership tests; scanning the list
# for every token of every tweet is needlessly slow.
_negative_word_set = frozenset(negative_words)

def has_negative_word(text):
    """Tell whether ``text`` contains at least one word from the negative-word lexicon.

    Args:
        text: a cleaned tweet; it is split on whitespace and each token is
            compared to the lexicon exactly (case-sensitive).

    Returns:
        bool: True if any token is in the lexicon, False otherwise (including
        for an empty string).
    """
    return any(word in _negative_word_set for word in text.split())

In [9]:
# Flag each cleaned tweet that contains at least one lexicon word
train_set["has_negative_word"] = train_set["rejoined"].apply(has_negative_word)

In [10]:
## Prepare input
train_set_input = train_set.iloc[:, [-1, -2]]
train_set_input.columns = ["target", "rejoined"]
train_set_input["target"] = train_set_input.target.apply(lambda x: 0 if x is True else 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_input["target"] = train_set_input.target.apply(lambda x: 0 if x is True else 4)


## Training the model

In [11]:
# Labels (0 / 4) and cleaned tweet texts as plain numpy arrays
y = train_set_input["target"].to_numpy()
X = train_set_input["rejoined"].to_numpy()

In [12]:
# Split into train (80%) and test (20%)

# random_state is fixed so that Restart & Run All yields the same stratified
# partition; without it every run trains and evaluates on different tweets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=42)

# n_splits=1, so this loop body runs exactly once.
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_t = X[train_index], X[test_index]
    y_train, y_t = y[train_index], y[test_index]

TRAIN: [49692 53227 40459 ... 71448  8557 56422] TEST: [68896 82020 23674 ...  4796  1184  5547]


In [13]:
#split test into test (10%) and val (10%)

# random_state is fixed so the held-out 20% is halved the same way on every run.
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=0.5, random_state=42)

# n_splits=1, so this loop body runs exactly once. The indices refer to
# positions inside X_t / y_t, not inside the full X / y.
for test_i, val_i in sss_val.split(X_t, y_t):
    print("TEST:", test_i, "Val:", val_i)
    X_test, X_val = X_t[test_i], X_t[val_i]
    y_test, y_val = y_t[test_i], y_t[val_i]

TEST: [ 9329 10580  7642 ... 12820 14986 17292] Val: [ 9766  2829 10343 ...  8624  5694  1646]


In [14]:
# check if split sizes are correct
print(*(part.shape for part in (X_train, y_train, X_test, y_test, X_val, y_val)))

(71592,) (71592,) (8949,) (8949,) (8950,) (8950,)


In [15]:
#vectorize the tweets

# The vocabulary is learned from the training split only; binary=True stores
# term presence rather than counts, min_df=2 is passed as the document-frequency cut-off.
cv = CountVectorizer(min_df=2, binary=True).fit(X_train)

X_train_vectorized, X_test_vectorized, X_val_vectorized = (
    cv.transform(split) for split in (X_train, X_test, X_val)
)


In [16]:
# Sanity check: fit a throw-away Naive Bayes model and count its validation errors
y_pred = MultinomialNB().fit(X_train_vectorized, y_train).predict(X_val_vectorized)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_val_vectorized.shape[0], np.count_nonzero(y_val != y_pred)))

Number of mislabeled points out of a total 8950 points : 573


In [17]:
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)  #final trained model
mnb_predictions = mnb.predict(X_val_vectorized)
# Accuracy computed from the predictions already at hand
mnb_accuracy = metrics.accuracy_score(y_val, mnb_predictions)
print(mnb_accuracy)

0.9359776536312849


In [18]:
# labels=[0, 4] pins the row order to the target encoding (0 = tweet contains a
# negative word, 4 = it does not), so each target_name is attached to the right
# class explicitly instead of relying on the sorted order of the labels that
# happen to occur in y_val / mnb_predictions.
print(metrics.classification_report(y_val, mnb_predictions, labels=[0, 4], digits=3, target_names=['negative sentiment', 'positive sentiment']))

                    precision    recall  f1-score   support

negative sentiment      0.900     0.963     0.931      3985
positive sentiment      0.968     0.914     0.941      4965

          accuracy                          0.936      8950
         macro avg      0.934     0.939     0.936      8950
      weighted avg      0.938     0.936     0.936      8950



In [19]:
#save trained model as pickle

#filename = 'neg_word_model.sav'
#with open(filename, 'wb') as f_out:
#    pickle.dump((mnb, cv), f_out)

## Testing on Sentiment140 dataset

In [20]:
# Sentiment140 target encoding: 0 = negative, 4 = positive
# The CSV has no header row, so the column names are supplied here.

df_testframe = pd.read_csv(
    'data/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='ISO-8859-1',
)
df_testframe.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [21]:
# Same cleaning steps as for the training tweets. assign() works on a copy, so the
# raw df_testframe shown above is no longer modified behind the reader's back
# (a plain `testframe = df_testframe` is only an alias for the same frame).
# The columns are evaluated in the order written, so each step can read the
# column produced by the previous one.
testframe = df_testframe.assign(
    no_urls=lambda d: d['text'].apply(remove_urls),  # remove urls
    unmentioned=lambda d: d['no_urls'].apply(remove_mentions),  # remove mentions
    unhashtagged=lambda d: d['unmentioned'].apply(remove_hashtags),  # remove hashtags
    depunctualized=lambda d: d['unhashtagged'].apply(remove_punct),  # remove punctuation and digits
    tokenized=lambda d: d['depunctualized'].apply(lambda tweet: tokenization(tweet.lower())),  # lowercase + tokenize
    improved_tokenized=lambda d: d['tokenized'].apply(remove_empty_tokens),  # remove empty tokens
    nonstop=lambda d: d['improved_tokenized'].apply(remove_stopwords),  # remove stopwords
    lemmatized=lambda d: d['nonstop'].apply(lemmatize),  # lemmatize the tokens
    rejoined_lem=lambda d: d['lemmatized'].apply(" ".join),  # lemmatized tokens, rejoined for vectorization
    rejoined=lambda d: d['nonstop'].apply(" ".join),  # non-lemmatized tokens, rejoined
)
testframe.head()

Unnamed: 0,target,id,date,flag,user,text,no_urls,unmentioned,unhashtagged,depunctualized,tokenized,improved_tokenized,nonstop,lemmatized,rejoined_lem,rejoined
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot - Awww, that's a bummer. You sho...","- Awww, that's a bummer. You shoulda got Da...","- Awww, that's a bummer. You shoulda got Da...",Awww thats a bummer You shoulda got David ...,"[, awww, thats, a, bummer, you, shoulda, got, ...","[awww, thats, a, bummer, you, shoulda, got, da...","[awww, thats, bummer, shoulda, got, david, car...","[awww, thats, bummer, shoulda, get, david, car...",awww thats bummer shoulda get david carr third...,awww thats bummer shoulda got david carr third...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, faceb...","[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might...","[upset, cant, update, facebook, texting, might...",upset cant update facebook texting might cry r...,upset cant update facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,I dived many times for the ball. Managed to s...,I dived many times for the ball Managed to sa...,"[, i, dived, many, times, for, the, ball, mana...","[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...","[dive, many, time, ball, manage, save, rest, g...",dive many time ball manage save rest go bound,dived many times ball managed save rest go bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]","[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","no, it's not behaving at all. i'm mad. why am...",no its not behaving at all im mad why am i he...,"[, no, its, not, behaving, at, all, im, mad, w...","[no, its, not, behaving, at, all, im, mad, why...","[behaving, im, mad, cant, see]","[behave, im, mad, cant, see]",behave im mad cant see,behaving im mad cant see


In [22]:
t_frame = testframe[["target", "rejoined_lem"]]
# Keep only the rows whose cleaned tweet is non-empty, in a single pass,
# then split the (label, tweet) pairs into the two parallel lists.
kept_rows = [(label, tweet) for label, tweet in t_frame.values if tweet]
testinput = [tweet for _, tweet in kept_rows]
testoutput = [label for label, _ in kept_rows]

In [23]:
## Loading model
# NOTE(review): this loads "test_lem.sav", not the 'neg_word_model.sav' named in the
# commented-out save cell, and it rebinds `mnb` and `cv`. From here on the notebook
# evaluates the pickled model, not necessarily the one trained above. Confirm this is intended.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open("test_lem.sav", 'rb') as model_file:
    model = pickle.load(model_file)
mnb = model[0]  # first element: the classifier
cv = model[1]   # second element: the fitted vectorizer

In [24]:
# vectorize with the fitted vectorizer
test_input_vectorized = cv.transform(raw_documents=testinput)

In [25]:
# Mean accuracy of the loaded model against the Sentiment140 labels
accuracy = mnb.score(X=test_input_vectorized, y=testoutput)

In [26]:
# Display the score returned by mnb.score() for the Sentiment140 tweets.
accuracy

0.6484905376047017