In [34]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Location of the raw CSV files, relative to this notebook.
path = '../Datasets/rawData/'
# 'train.csv' contains no format placeholders, so the former `.format(1)` call
# was a no-op and has been removed; the path that gets read is unchanged.
training_data = pd.read_csv(path + 'train.csv')
# Preview the first rows to confirm the columns loaded as expected.
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [36]:
# Work with the first 50,000 rows only: the raw comment text is the feature
# and the `toxic` column is the target.
X = training_data['comment_text'].iloc[:50000]
y = training_data['toxic'].iloc[:50000]
# Sanity check: both selections must have the same length.
print(X.shape, y.shape, sep='\n')

(50000,)
(50000,)


In [54]:
# split X and y into training and testing sets
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# No test_size is passed, so the library default split is used; random_state=1
# fixes the shuffle so the same rows land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(37500,)
(12500,)
(37500,)
(12500,)


In [55]:
# peek at the first few held-out comments; the index labels are the original
# row numbers, now in shuffled order because of the split
X_test.head()

26247            Yes that was it. I've just found out that
35067    Bob Hope \nI can't imagine your reasons. Its c...
34590    "\n\nDid you know? was updated. On 23 May, 200...
16668    Thanks, Alex. If you think you can come up wit...
12196    "== Expand ==\nCould somebody with adequate kn...
Name: comment_text, dtype: object

## Vectorize the text data

In [38]:
# instantiate the vectorizer with default parameters
# (no arguments are passed, so every CountVectorizer setting keeps its default)
vect = CountVectorizer()

In [39]:
# learn the training vocabulary and build the document-term matrix in one pass;
# fit_transform() yields the same result as fit() followed by transform() but
# avoids tokenizing the training comments twice
X_train_dtm = vect.fit_transform(X_train)

In [40]:
# examine the document-term matrix: one row per training comment, one column
# per vocabulary term learned by the vectorizer
X_train_dtm

<37500x79219 sparse matrix of type '<class 'numpy.int64'>'
	with 1639887 stored elements in Compressed Sparse Row format>

In [41]:
# transform testing data (using fitted vocabulary) into a document-term matrix
# only transform() is called here -- the vectorizer is not re-fitted, so the
# test matrix has exactly the same columns as the training matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<12500x79219 sparse matrix of type '<class 'numpy.int64'>'
	with 519720 stored elements in Compressed Sparse Row format>

## Building a Naive Bayes Model

In [42]:
# import and instantiate a Multinomial Naive Bayes model
# (constructed with no arguments, i.e. the library defaults)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [43]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
# %time reports CPU and wall-clock time for this single statement
%time nb.fit(X_train_dtm, y_train)

CPU times: user 25.3 ms, sys: 3.33 ms, total: 28.6 ms
Wall time: 28 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
# make class predictions for X_test_dtm
# (hard class labels from the fitted model, one per row of the test matrix)
y_pred_class = nb.predict(X_test_dtm)

In [45]:
# calculate accuracy of class predictions
# NOTE: the classes are imbalanced -- per the confusion matrix in this notebook
# the test set holds 11,248 non-toxic vs 1,252 toxic comments -- so predicting
# "non-toxic" for every row would already score about 0.90
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.94592

In [46]:
# print the confusion matrix
# rows are the true class, columns are the predicted class:
#   [[true negatives,  false positives],
#    [false negatives, true positives]]
metrics.confusion_matrix(y_test, y_pred_class)

array([[11125,   123],
       [  553,   699]])

In [47]:
# show the comments the model flagged as toxic although their true label is 0
# (false positives: actual 0, predicted 1)
X_test[(y_test == 0) & (y_pred_class == 1)]

33694    yo yo wats up wat cha doin yawljames cook was ...
1613                      I smell like octopus poo and wee
22568    Eurm\nI thought you were sick! How did you sud...
16365                    Hi girlie, you miss the turd doc?
32967    OK, I've proven his notability, go check it ou...
6280     " \n\nGet a life3 you faggit 4 lyfe. I asked u...
48530    I can only pray you're less of a judgmental hu...
13518                     I am in utter disgust right now.
38259    "\nThat's my darling Keeper. He grins like a b...
9312     Yo\n\nYo man, you'll pick things up...read gui...
1325     Uh oh, somebodies got internet muscles.99.235....
37087    Master Bates \n\nMaster Bates, Master Bates, s...
14097                 Death\n\nMan I'd kill for a cookie..
1827     Red links\nThe following red links have been e...
44540    "\n\n A what?!?!?! \n\n""Crowley is a vegetari...
48807    Dick Scanlan 4 lyfe \n\nAren't you a little yo...
12314    Shame! HAHAHAHA! Yeah, i do. Whoopsie, that's .

In [50]:
# show the toxic comments the model failed to flag
# (false negatives: actual 1, predicted 0)
X_test[(y_test == 1) & (y_pred_class == 0)]

24290    Vandalism is when you come along and delete va...
15073    I have never once seen a tangible benefit from...
14716    "\n\n==SUSPICIOUS MASS EDITS-IS ANY ADMINISTRA...
11751    Wow. A snide rude response from Xeworlebi. The...
23082    You've got to be kidding me! \n\nPlease stop p...
43113    I am also issuing YOU a citation for uncivil c...
29703    Weak \n\nI sense much weakness. If there were ...
19074    Once\nThe song is due to chart at #1 in the UK...
14962    "\nI feel bad for you actually.  You sit aroun...
7348     You know what, this is bullshit.  Tried to do ...
7017     "Propol]] drew first\nblood . Well, It gets pe...
39590    the book is dumb \n\nthis book is so dumb how ...
2015     I know that's not mature. And anyway, that was...
10903    Playing god again==\nBut time the deletion naz...
40318    "\n\nYou're an idiot per usual, but I'll oblig...
722      Don't peddle your crap please.  Hate to see an...
42216    You are insane. What is wrong with writing an .

In [51]:
# example false negative: fetch one comment by its original row label (3365);
# .loc makes it explicit that this is a label lookup, not a position
X_test.loc[3365]

"and for the capitalize thing actually its not bad grammer if your using it in titles... You need a lesson from your Teacher Matt Stiker... Lol J/k ... But Seriously though if me calling other people gay offends you... you have to be a total nerd in real life that can't take being called names... oh well i feel sorry for you... wikipedia is probably the most action you get besides your creeper sites you visit..."

In [52]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
# predict_proba returns one column per class; [:, 1] keeps the second column,
# i.e. the probability assigned to the positive (toxic == 1) class
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([2.59872554e-04, 2.00115442e-04, 1.05064763e-19, ...,
       1.29987859e-27, 3.54429225e-11, 2.56761244e-04])

In [53]:
# calculate AUC
# scored on the predicted probabilities rather than the hard class labels, so it
# measures how well toxic comments are ranked above non-toxic ones
metrics.roc_auc_score(y_test, y_pred_prob)

0.8896817013120403