In [5]:
# NLP imports
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [6]:
# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [7]:
# python imports
import re
import json
from collections import Counter
from matplotlib import pyplot as plt 

In [8]:
import pandas as pd
import numpy as np 

In [9]:
# seaborn import
import seaborn as sns

In [10]:
# Download the NLTK resources used for preprocessing, then create the shared
# stop-word set, stemmer and TF-IDF vectorizer instances.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
tf = TfidfVectorizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ResearchSiann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ResearchSiann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Load the training CSV. The file is read without a header row (header=None),
# so pandas labels the columns 0..5; later cells use column 0 as the sentiment
# label and column 5 as the tweet text.
# NOTE(review): this is an absolute, user-specific path -- the cell will only
# run on a machine that has the file at exactly this location.
fname = "/Users/ResearchSiann/Downloads/VIII corpuses/trainingandtestdata/training.1600000.processed.noemoticon.csv"
df_train = pd.read_csv(fname, header=None, encoding='cp1252')

In [12]:
# Preview the first rows; columns are labelled 0-5 because the CSV was read with header=None.
df_train.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
# Preview the last rows; column 0 reads 4 here, whereas it reads 0 in the first rows.
df_train.tail()

Unnamed: 0,0,1,2,3,4,5
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [14]:
# Label-based lookup: the tweet text stored at row label 0, column label 5.
df_train.loc[0,5]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [15]:
# .loc slices by label and includes the end label, so this shows rows 0 through 5 of column 5.
df_train.loc[0:5, 5]

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
5                        @Kwesidei not the whole crew 
Name: 5, dtype: object

In [16]:
# Display the stop-word set: common words that clean() filters out before stemming.
# NOTE(review): the set includes negations such as 'no', 'nor' and 'not', which
# can carry sentiment -- confirm that dropping them is intended.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [17]:
#create a function
def clean(text):
    """Normalise one piece of text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. split ``text`` into tokens with ``word_tokenize``;
      2. drop every token found in the module-level ``stop_words`` set,
         compared case-insensitively so capitalised forms such as "The"
         or "I" are removed as well;
      3. stem the remaining tokens with the module-level stemmer ``ps``;
      4. discard stems that are not purely alphanumeric (punctuation,
         tokens containing an apostrophe, and so on).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when
        no token survives.
    """
    # NOTE(review): the stop-word set contains negations ("no", "not", ...);
    # removing them may discard signal for sentiment analysis -- confirm intended.
    stems = (
        ps.stem(token)
        for token in word_tokenize(text)
        # The stop-word entries are lower-case, so fold case before the lookup;
        # the token itself is passed to the stemmer unchanged.
        if token.lower() not in stop_words
    )
    return " ".join(stem for stem in stems if stem.isalnum())

In [18]:
import pickle  # used by the later cells that pickle the fitted objects

# Despite the .pkl extension, this file is parsed as JSON text, not unpickled.
with open("sentences_processed.pkl", "r") as sentences_file:
    sentences_processed = json.load(sentences_file)

In [19]:
sentences_processed[:1]

['awww that bummer you shoulda got david carr third day D']

In [23]:
# Pair every pre-processed sentence with its label. Column 0 of df_train is
# used as the sentiment value (the previews of df_train show 0 and 4 there).
Y = df_train[0].tolist()
data = pd.DataFrame(data={'sentences': sentences_processed, 'sentiment': Y})
data.head()

Unnamed: 0,sentences,sentiment
0,awww that bummer you shoulda got david carr th...,0
1,upset cant updat facebook text might cri resul...,0
2,I dive mani time ball manag save the rest go b...,0
3,whole bodi feel itchi like fire,0
4,behav im mad I cant see,0


In [33]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
# Bag-of-words step: learn the vocabulary from all sentences and build the
# document-term count matrix; the last line displays its shape.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so held-out rows contribute to the vocabulary -- confirm
# this is acceptable.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data['sentences'])
X_train_counts.shape

(1600000, 219117)

In [35]:
# Persist the fitted CountVectorizer to disk.
with open("countvect.pkl", "wb") as vectorizer_file:
    pickle.dump(count_vect, vectorizer_file)


In [36]:
# Convert raw term occurrences to TF-IDF weights; the resulting shape matches
# the count matrix.
# NOTE(review): despite its name, X_train_tfidf covers every row -- the
# train/test split happens in a later cell.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


(1600000, 219117)

In [37]:
# Persist the fitted TF-IDF transformer to disk.
with open("tf.pkl", "wb") as transformer_file:
    pickle.dump(tfidf_transformer, transformer_file)

In [38]:
### Split data into Test-Train sets
# Hold out 0.1% of the rows (test_size=0.001) using a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    data['sentiment'],
    test_size=0.001,
    random_state=1,
)

# Fit a multinomial Naive Bayes classifier on the training split.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
predicted = clf.predict(X_test)
print("NB Model Accuracy:", metrics.accuracy_score(y_test, predicted))


NB Model Accuracy: 0.76625


In [40]:
# Persist the trained classifier to disk.
import pickle

Pkl_Filename = "Pickle_Model.pkl"
with open(Pkl_Filename, 'wb') as model_file:
    pickle.dump(clf, model_file)