In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer

In [2]:
# Read the raw tweet dump from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='dataset.csv')
print(data.head(n=5))

   mood    tweet_id                      datetime     query             user  \
0     0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1     0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2     0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3     0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4     0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                               tweet  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


In [3]:
# Keep only the tweet text and its label. The explicit .copy() detaches the
# result from the full frame, so adding/dropping columns in the later cells
# does not operate on a slice and trigger pandas' SettingWithCopyWarning.
data = data[['tweet', 'mood']].copy()
print(data.head())

                                               tweet  mood
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...     0
1  is upset that he can't update his Facebook by ...     0
2  @Kenichan I dived many times for the ball. Man...     0
3    my whole body feels itchy and like its on fire      0
4  @nationwideclass no, it's not behaving at all....     0


In [4]:
# Binary label: a mood of 0 stays 0, every other mood value becomes 1.
data['sentiments'] = data['mood'].ne(0).astype('int64')
print(data.head())

                                               tweet  mood  sentiments
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...     0           0
1  is upset that he can't update his Facebook by ...     0           0
2  @Kenichan I dived many times for the ball. Man...     0           0
3    my whole body feels itchy and like its on fire      0           0
4  @nationwideclass no, it's not behaving at all....     0           0


In [5]:
# The raw label column is no longer needed once 'sentiments' exists.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising KeyError on the already-removed column.
data = data.drop(columns='mood', errors='ignore')
print(data.head())

                                               tweet  sentiments
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...           0
1  is upset that he can't update his Facebook by ...           0
2  @Kenichan I dived many times for the ball. Man...           0
3    my whole body feels itchy and like its on fire            0
4  @nationwideclass no, it's not behaving at all....           0


In [6]:
# Class balance of the binary label.
label_counts = data['sentiments'].value_counts()
print(label_counts)

1    800000
0    800000
Name: sentiments, dtype: int64


In [7]:
# Shuffle every row. frac=1 avoids hard-coding the row count (sample(n) raises
# ValueError when n exceeds the frame length) and random_state makes the
# shuffle reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42)
print(data.head())

                                                     tweet  sentiments
1116374  @luckee13  LOL, I noticed it went to you, was ...           1
197658   @twitt3rbox first off live your app  1 questio...           0
1556029  @Quiggmate: Thank you sooo mmuchh x you have m...           1
440232   Fascinating stuff but I am getting on with som...           0
348254   Back at work  Oh well, only three more days un...           0


In [8]:
# Tokens are runs of ASCII letters; digits, punctuation and whitespace are
# dropped (e.g. "twitt3rbox" yields "twitt" and "rbox").
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')

In [9]:
# Turn each raw tweet string into its list of tokens.
data['tweet'] = data['tweet'].map(tokenizer.tokenize)
print(data.head())

                                                     tweet  sentiments
1116374  [luckee, LOL, I, noticed, it, went, to, you, w...           1
197658   [twitt, rbox, first, off, live, your, app, que...           0
1556029  [Quiggmate, Thank, you, sooo, mmuchh, x, you, ...           1
440232   [Fascinating, stuff, but, I, am, getting, on, ...           0
348254   [Back, at, work, Oh, well, only, three, more, ...           0


In [11]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer used to normalise the tweet tokens.
stemmer = SnowballStemmer(language='english')

In [12]:
def _stem_tokens(tokens):
    """Return the stemmed form of every token, preserving order."""
    return list(map(stemmer.stem, tokens))


data['tweet'] = data['tweet'].map(_stem_tokens)
print(data.head())

                                                     tweet  sentiments
1116374  [lucke, lol, i, notic, it, went, to, you, was,...           1
197658   [twitt, rbox, first, off, live, your, app, que...           0
1556029  [quiggmat, thank, you, sooo, mmuchh, x, you, h...           1
440232   [fascin, stuff, but, i, am, get, on, with, som...           0
348254   [back, at, work, oh, well, onli, three, more, ...           0


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer using scikit-learn's built-in English stop-word list.
# NOTE(review): the tweets are stemmed before they reach this vectorizer, so
# stop words whose stem differs from the listed form (e.g. "only" is stored as
# "onli") are presumably not filtered — confirm whether that is intended.
cv = CountVectorizer(stop_words='english')

In [14]:
# Re-join each token list into one space-separated string so every tweet is
# a single document for the vectorizer.
data['tweet'] = data['tweet'].str.join(' ')
data

Unnamed: 0,tweet,sentiments
1116374,lucke lol i notic it went to you was pose to g...,1
197658,twitt rbox first off live your app question wh...,0
1556029,quiggmat thank you sooo mmuchh x you have made...,1
440232,fascin stuff but i am get on with some work re...,0
348254,back at work oh well onli three more day until...,0
...,...,...
469480,alexpapworth i agre char are too short i could...,0
1146387,sgogolev that s what i do too also just came b...,1
1107391,andiwalshaw the other shin album are absolut n...,1
147176,has a sick littl boy just in time for his birt...,0


In [15]:
# Learn the vocabulary and build the sparse document-term count matrix.
# NOTE(review): the vocabulary is fitted on all rows, before the train/test
# split performed in the following cell — confirm this is acceptable for the
# evaluation being reported.
data_tf = cv.fit_transform(data['tweet'])
data_tf

<1600000x533365 sparse matrix of type '<class 'numpy.int64'>'
	with 11146262 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.model_selection import train_test_split

# Fix the seed so the same rows land in train/test on every run, and stratify
# on the label so both partitions keep the dataset's class balance.
x_train, x_test, y_train, y_test = train_test_split(
    data_tf,
    data['sentiments'],
    random_state=42,
    stratify=data['sentiments'],
)

In [17]:
# Re-check the class balance of the labels that were passed to the split.
data['sentiments'].value_counts()

1    800000
0    800000
Name: sentiments, dtype: int64

In [18]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier, constructed with no arguments
# (i.e. whatever defaults the installed scikit-learn version provides).
mnb = MultinomialNB()

In [19]:
# Train the classifier on the training partition of the count matrix.
mnb.fit(X=x_train, y=y_train)

MultinomialNB()

In [1]:
# Accuracy on the training rows is optimistic because the model has already
# seen them; the held-out test accuracy is the figure that estimates
# generalisation, so report both (train first, then test).
print(mnb.score(x_train, y_train))
print(mnb.score(x_test, y_test))

NameError: name 'mnb' is not defined

In [22]:
import pickle

# Persist the trained classifier. The context manager closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open('mnb_model_large.pickle', 'wb') as model_file:
    pickle.dump(mnb, model_file)

In [31]:
# Persist the fitted vectorizer so new text can be encoded with the same
# vocabulary as the saved model. The context manager closes the file even if
# pickling raises.
with open('count_vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)