In [38]:
import pandas as pd

In [39]:
# Read the training CSV, then replace every missing cell with an empty string.
train_datas = pd.read_csv('new_training_filtered.csv')
train_datas = train_datas.fillna('')

In [40]:
# Rename the two columns positionally: the class label first, then the text
# that is later fed to the vectorizer.
train_datas.columns = ['label', 'trainvals']

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Features (raw text) and targets (class labels) taken from the training frame.
X, y = train_datas['trainvals'], train_datas['label']

In [121]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop-word list.
vect = TfidfVectorizer(stop_words='english')

In [122]:
# Fit the vectorizer on the full training text and build its document-term matrix.
X_train_dtm = vect.fit_transform(X)

In [123]:
# Read the test-tweets CSV, then replace every missing cell with an empty string.
test_datas = pd.read_csv('test_tweets.csv')
test_datas = test_datas.fillna('')

In [124]:
# Rename the two columns positionally: tweet text first, then its timestamp.
test_datas.columns = ['tweets', 'datetime']

In [125]:
# Text of the test tweets that will be classified.
X_test = test_datas['tweets']

In [126]:
# Vectorize the test tweets with the already-fitted vectorizer (transform only, no refit).
X_test_dtm = vect.transform(X_test)

In [127]:
from sklearn.naive_bayes import MultinomialNB

In [128]:
# Multinomial Naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [129]:
# Train on the full training matrix; fit() returns the estimator, so its repr is displayed.
mnb.fit(X_train_dtm, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [130]:
# Predict a class label for every vectorized test tweet.
preds = mnb.predict(X_test_dtm)

In [131]:
# Display the array of predicted labels.
preds

array([4, 0, 4, ..., 4, 0, 4])

In [132]:
# hold-out validation: split the labelled data into a training part and an evaluation part

In [133]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20.
# Fall back to the old location only for installs that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [136]:
# Hold out part of the labelled data for evaluation using the default split
# size; random_state=1 makes the shuffle repeatable.
# NOTE: this rebinds X_test, which until now held the unlabeled test tweets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [137]:
# Refit the vectorizer on the training split only; this replaces the vocabulary
# and matrix that were built earlier from the full training data.
X_train_dtm = vect.fit_transform(X_train)

In [138]:
# Vectorize the held-out split with the vocabulary learned from the training split.
X_test_dtm = vect.transform(X_test)

In [139]:
# Retrain the same classifier object, this time on the training split only.
mnb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [140]:
# Predicted class labels for the held-out split.
y_pred_class = mnb.predict(X_test_dtm)

In [141]:
# calculate accuracy of class predictions
from sklearn import metrics

In [142]:
# Fraction of held-out samples whose predicted label equals the true label.
metrics.accuracy_score(y_test, y_pred_class)

0.75065407390905647

In [143]:
# Number of held-out samples per class.
y_test.value_counts()

0    97080
4    95941
Name: label, dtype: int64

In [144]:
# null accuracy: share of each class among the held-out labels
# (always predicting the most frequent class would score the largest share)
null_accuracy = y_test.value_counts().head(4).div(len(y_test))
null_accuracy
# the model's accuracy is higher than the null accuracy

0    0.50295
4    0.49705
Name: label, dtype: float64

In [145]:
# confusion matrix: rows are the true labels, columns are the predicted labels
metrics.confusion_matrix(y_test, y_pred_class)
# Layout, treating label 0 as negative and label 4 as positive:
# [TN FP
#  FN TP]

array([[74826, 22254],
       [25875, 70066]])

In [146]:
# Naive Bayes counts the number of training observations in each class
# (one entry per class, in the same order as the rows of feature_count_)
mnb.class_count_

array([ 291307.,  287754.])

In [147]:
# examining a model for further insight

In [148]:
# Store the vocabulary learned from X_train as a list of token strings.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older installs. list() keeps the result a plain list either way.
if hasattr(vect, 'get_feature_names_out'):
    X_train_tokens = list(vect.get_feature_names_out())
else:
    X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

206084

In [149]:
# Per-class accumulated feature values and their shape:
# one row per class, one column per vocabulary token.
mnb.feature_count_, mnb.feature_count_.shape

(array([[ 19.83040421,   9.31141214,   4.40243255, ...,   0.62480385,
           0.        ,   0.54764071],
        [  9.38848139,   4.75246617,   4.50377229, ...,   0.        ,
           0.5471108 ,   0.        ]]), (2, 206084))

In [150]:
# First row of the per-class feature totals: the accumulated weight of every
# token over the training samples of the first class (label 0).
zero_token_count = mnb.feature_count_[0]
zero_token_count

array([ 19.83040421,   9.31141214,   4.40243255, ...,   0.62480385,
         0.        ,   0.54764071])

In [151]:
# Second row of the per-class feature totals: the accumulated weight of every
# token over the training samples of the second class (label 4).
four_token_count = mnb.feature_count_[1]
four_token_count

array([ 9.38848139,  4.75246617,  4.50377229, ...,  0.        ,
        0.5471108 ,  0.        ])

In [152]:
# One row per vocabulary token, holding its accumulated weight in each class;
# the token string itself becomes the index.
tokens = pd.DataFrame({'token': X_train_tokens, 'zero': zero_token_count, 'four': four_token_count})
tokens = tokens.set_index('token')

In [153]:
# Peek at the first few tokens and their per-class totals.
tokens.head()

Unnamed: 0_level_0,four,zero
token,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,9.388481,19.830404
aaa,4.752466,9.311412
aaaa,4.503772,4.402433
aaaaa,2.164179,6.070895
aaaaaa,2.204282,4.192905


In [154]:
# Fixed random sample of five tokens; the same random_state is reused in later
# cells so the same rows can be followed through each transformation.
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,four,zero
token,Unnamed: 1_level_1,Unnamed: 2_level_1
englishtagalog,0.0,0.457875
belmont,6.06432,3.846933
riffs,0.497215,0.504904
dounloading,0.0,0.605964
ivf,0.517619,1.01059


In [155]:
# Add 1 to the four and zero counts to avoid dividing by 0.
# The columns are rebuilt from the raw per-class count arrays instead of from
# their own current values, so re-running this cell always gives count + 1
# rather than adding another 1 each time. The arrays are in the same row order
# as `tokens`, which was built from them.
tokens['four'] = four_token_count + 1
tokens['zero'] = zero_token_count + 1
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,four,zero
token,Unnamed: 1_level_1,Unnamed: 2_level_1
englishtagalog,1.0,1.457875
belmont,7.06432,4.846933
riffs,1.497215,1.504904
dounloading,1.0,1.605964
ivf,1.517619,2.01059


In [156]:
# Convert the zero and four counts into frequencies: (count + 1) divided by the
# number of training observations in that class.
# Computed from the raw count arrays rather than from the current column
# values, so re-running this cell does not divide by the class size again.
tokens['zero'] = (zero_token_count + 1) / mnb.class_count_[0]
tokens['four'] = (four_token_count + 1) / mnb.class_count_[1]
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,four,zero
token,Unnamed: 1_level_1,Unnamed: 2_level_1
englishtagalog,3e-06,5e-06
belmont,2.5e-05,1.7e-05
riffs,5e-06,5e-06
dounloading,3e-06,6e-06
ivf,5e-06,7e-06


In [157]:
# Ratio of each token's class-0 frequency to its class-4 frequency;
# values above 1 mean the token is relatively more frequent in class 0.
tokens['zero_ratio'] = tokens['zero'].div(tokens['four'])
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,four,zero,zero_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
englishtagalog,3e-06,5e-06,1.440094
belmont,2.5e-05,1.7e-05,0.677746
riffs,5e-06,5e-06,0.992876
dounloading,3e-06,6e-06,1.586376
ivf,5e-06,7e-06,1.308674


In [158]:
# Tokens ordered from the highest zero-to-four ratio downwards
# (returns a sorted copy; `tokens` itself keeps its original order).
tokens.sort_values(by='zero_ratio', ascending=False)

Unnamed: 0_level_0,four,zero,zero_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fawcett,0.000003,0.000200,57.411509
inaperfectworld,0.000003,0.000132,38.063383
mcmahon,0.000004,0.000116,26.168188
farrah,0.000011,0.000281,25.113455
unloved,0.000005,0.000110,24.205436
sadd,0.000003,0.000082,23.515779
sad,0.000457,0.009960,21.771824
carradine,0.000006,0.000137,21.622820
saddened,0.000005,0.000101,21.069996
boohoo,0.000006,0.000125,19.948774


In [159]:
# Look up the zero-to-four ratio of a single token by its index label.
tokens.loc['sadd', 'zero_ratio']

23.515779348528827