# Train a classifier on sentiment140

In [8]:
import pandas as pd
import numpy as np

# Source and column layout: http://help.sentiment140.com/for-students/
#
# The CSV has no header row; its six columns are:
#   0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
#   1 - the id of the tweet (2087)
#   2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
#   3 - the query (lyx). If there is no query, then this value is NO_QUERY.
#   4 - the user that tweeted (robotickilldozr)
#   5 - the text of the tweet (Lyx is cool)

# Only the polarity and the tweet text are used afterwards, so let the parser
# skip the other four columns instead of loading and then discarding them.
s140_train = pd.read_csv('model_data/training.1600000.processed.noemoticon.csv', encoding='latin-1',
                         names=['sentiment','id','date','query','user','text'], header=None,
                         usecols=['sentiment', 'text'])

# Drop neutral tweets (polarity 2) so only negative and positive examples remain.
s140_train = s140_train[s140_train['sentiment'] != 2]



In [11]:
# Features: the raw tweet text column.
X_train = s140_train.loc[:, 'text']

# Labels: 0 where the polarity is 0 (negative), 1 for every other polarity value.
is_negative = s140_train['sentiment'] == 0
y_train = np.where(is_negative, 0, 1)

In [14]:
# Sanity check: labels and texts should have the same number of rows.
y_train.shape, X_train.shape

((1600000,), (1600000,))

In [15]:
# Peek at the first raw tweets to see what the text looks like before vectorization.
X_train[:20]

0     @switchfoot http://twitpic.com/2y1zl - Awww, t...
1     is upset that he can't update his Facebook by ...
2     @Kenichan I dived many times for the ball. Man...
3       my whole body feels itchy and like its on fire 
4     @nationwideclass no, it's not behaving at all....
5                         @Kwesidei not the whole crew 
6                                           Need a hug 
7     @LOLTrish hey  long time no see! Yes.. Rains a...
8                  @Tatiana_K nope they didn't have it 
9                             @twittera que me muera ? 
10          spring break in plain city... it's snowing 
11                           I just re-pierced my ears 
12    @caregiving I couldn't bear to watch it.  And ...
13    @octolinz16 It it counts, idk why I did either...
14    @smarrison i would've been the first, but i di...
15    @iamjazzyfizzle I wish I got to watch it with ...
16    Hollis' death scene will hurt me severely to w...
17                                 about to file

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list, but keep negations and a few
# qualifiers ('not', 'no', 'but', 'very', ...) out of it so they stay in the
# vocabulary and can take part in bigrams.
stopwords_nltk = set(stopwords.words("english"))
relevant_words = {'not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'}
# sorted() keeps the list order deterministic across runs (set iteration order is not).
stopwords_filtered = sorted(stopwords_nltk - relevant_words)

# Bag-of-words over unigrams and bigrams, capped at the 10000 most frequent terms.
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                             stop_words=stopwords_filtered, max_features=10000,
                             ngram_range=(1, 2))

words_matrix = vectorizer.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and keep `vocabulary` a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()


In [20]:
# Inspect the first entries of the learned vocabulary (unigrams and bigrams).
vocabulary[:20]

['00',
 '000',
 '02',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '10 30',
 '10 days',
 '10 hours',
 '10 mins',
 '10 minutes',
 '10 years',
 '100',
 '100 followers',
 '1000',
 '100th']

In [22]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the bag-of-words counts.
# NOTE(review): default hyperparameters are used and convergence is not checked — confirm this is intended.
logistic_model = LogisticRegression()
logistic_model.fit(words_matrix, y_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and keep `vocabulary` a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

# Pair every vocabulary entry with its learned weight; row 0 of coef_ is used,
# as in the original (the labels are binary: 0 / 1).
coefs = logistic_model.coef_
word_importances = pd.DataFrame({'word': vocabulary, 'coef': coefs[0]})

# Largest coefficients first, i.e. terms pushing hardest toward label 1.
word_importances_sorted = word_importances.sort_values(by='coef', ascending=False)
print(word_importances_sorted)

          coef             word
6048  3.595189       no problem
6167  3.377626          not sad
6065  3.253633       no worries
6205  3.252938    nothing wrong
4413  3.216784          isn bad
7395  3.000354          sad sad
6047  2.944481          no prob
9422  2.921266         wasn bad
6105  2.845657          not bad
2296  2.795189         don miss
6049  2.655038         no probs
9641  2.588553        wish luck
6101  2.296067        not alone
7841  2.294130          smiling
6039  2.267053          no need
1740  2.216351  congratulations
1390  2.203413      cannot wait
1404  2.065235        cant wait
9507  2.006067          welcome
6020  1.962237         no doubt
6054  1.959802        no school
5282  1.933175         made day
3837  1.915842        hate hate
3332  1.911487          go girl
6732  1.901858         pleasure
4054  1.879394          honored
8518  1.869588            thank
2281  1.864983       don forget
9874  1.859214            yayyy
5742  1.834959      musicmonday
...     

In [24]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# import the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Serialize the fitted classifier to 'logistic_model.pkl' (path relative to the working directory).
joblib.dump(logistic_model, 'logistic_model.pkl')

['vectorizer.pkl']

In [None]:
# Serialize the fitted vectorizer to 'vectorizer.pkl' (path relative to the working directory).
joblib.dump(value=vectorizer, filename='vectorizer.pkl')