# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [41]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('E:\MLCourse\emails\spam', 'spam'))
data = data.append(dataFrameFromDirectory('E:\MLCourse\emails\ham', 'ham'))


Let's have a look at that DataFrame:

In [2]:
data.head()

Unnamed: 0,message,class
E:\MLCourse\emails\spam\00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
E:\MLCourse\emails\spam\00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
E:\MLCourse\emails\spam\00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
E:\MLCourse\emails\spam\00004.eac8de8d759b7e74154f142194282724,##############################################...,spam
E:\MLCourse\emails\spam\00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,spam


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [5]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB()

Let's try it out:

In [6]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [8]:
ex1 = ['STOP BULLYING!!' , 'How are you?']
ex1counts = vectorizer.transform(ex1)
predictions = classifier.predict(ex1counts)
predictions

array(['ham', 'ham'], dtype='<U4')

In [9]:
e2 = ['Avail this offer' , 'Get 50% discount coupens']
ex2counts = vectorizer.transform(e2)
predictions= classifier.predict(ex2counts)
predictions


array(['spam', 'spam'], dtype='<U4')

In [10]:
X = data['message']
X

E:\MLCourse\emails\spam\00001.7848dde101aa985090474a91ec93fcf0    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...
E:\MLCourse\emails\spam\00002.d94f1b97e48ed3b553b3508d116e6a09    1) Fight The Risk of Cancer!\n\nhttp://www.adc...
E:\MLCourse\emails\spam\00003.2ee33bc6eacdb11f38d052c44819ba6c    1) Fight The Risk of Cancer!\n\nhttp://www.adc...
E:\MLCourse\emails\spam\00004.eac8de8d759b7e74154f142194282724    ##############################################...
E:\MLCourse\emails\spam\00005.57696a39d7d84318ce497886896bf90d    I thought you might like these:\n\n1) Slim Dow...
                                                                                        ...                        
E:\MLCourse\emails\ham\02496.aae0c81581895acfe65323f344340856     Man killed 'trying to surf' on Tube train \n\n...
E:\MLCourse\emails\ham\02497.60497db0a06c2132ec2374b2898084d3     Hi Gianni,\n\n\n\nA very good resource for thi...
E:\MLCourse\emails\ham\02498.09835f512f156da210efb99fcc523e21     Gianni

In [11]:
Y = data['class']
Y 


E:\MLCourse\emails\spam\00001.7848dde101aa985090474a91ec93fcf0    spam
E:\MLCourse\emails\spam\00002.d94f1b97e48ed3b553b3508d116e6a09    spam
E:\MLCourse\emails\spam\00003.2ee33bc6eacdb11f38d052c44819ba6c    spam
E:\MLCourse\emails\spam\00004.eac8de8d759b7e74154f142194282724    spam
E:\MLCourse\emails\spam\00005.57696a39d7d84318ce497886896bf90d    spam
                                                                  ... 
E:\MLCourse\emails\ham\02496.aae0c81581895acfe65323f344340856      ham
E:\MLCourse\emails\ham\02497.60497db0a06c2132ec2374b2898084d3      ham
E:\MLCourse\emails\ham\02498.09835f512f156da210efb99fcc523e21      ham
E:\MLCourse\emails\ham\02499.b4af165650f138b10f9941f6cc5bce3c      ham
E:\MLCourse\emails\ham\02500.05b3496ce7bca306bed0805425ec8621      ham
Name: class, Length: 3000, dtype: object

In [42]:
data.columns


Index(['message', 'class'], dtype='object')

In [43]:
data = data.reset_index(drop = True)

In [44]:
data.columns

Index(['message', 'class'], dtype='object')

In [46]:
X = data['message']

In [47]:
X

0       <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...
1       1) Fight The Risk of Cancer!\n\nhttp://www.adc...
2       1) Fight The Risk of Cancer!\n\nhttp://www.adc...
3       ##############################################...
4       I thought you might like these:\n\n1) Slim Dow...
                              ...                        
2995    Man killed 'trying to surf' on Tube train \n\n...
2996    Hi Gianni,\n\n\n\nA very good resource for thi...
2997    Gianni Ponzi wrote:\n\n> I have a prob when tr...
2998    Neale Pickett <neale@woozle.org> writes:\n\n\n...
2999    \n\nHi,\n\n\n\nI think you need to give us a l...
Name: message, Length: 3000, dtype: object

In [48]:
Y = data['class']

In [49]:
Y

0       spam
1       spam
2       spam
3       spam
4       spam
        ... 
2995     ham
2996     ham
2997     ham
2998     ham
2999     ham
Name: class, Length: 3000, dtype: object

In [50]:
from sklearn.model_selection import train_test_split

In [52]:
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size = 0.2)

In [55]:
X_train.shape

(2400,)

In [57]:
data_tuple = numpy.array(data)

In [58]:
data_tuple

array([['<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n\n<HTML><HEAD>\n\n<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=\n\nype>\n\n<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>\n\n<BODY><!-- Inserted by Calypso -->\n\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\n\nules=3Dnone \n\nstyle=3D"COLOR: black; DISPLAY: none" width=3D"100%">\n\n  <TBODY>\n\n  <TR>\n\n    <TD colSpan=3D3>\n\n      <HR color=3Dblack noShade SIZE=3D1>\n\n    </TD></TR></TD></TR>\n\n  <TR>\n\n    <TD colSpan=3D3>\n\n      <HR color=3Dblack noShade SIZE=3D1>\n\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\n\n --><FONT \n\ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\n\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 \n\nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=

In [60]:
x = [label[0] for label in data_tuple]
y = [label[1] for label in data_tuple]

In [61]:
x

['<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n\n<HTML><HEAD>\n\n<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=\n\nype>\n\n<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>\n\n<BODY><!-- Inserted by Calypso -->\n\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\n\nules=3Dnone \n\nstyle=3D"COLOR: black; DISPLAY: none" width=3D"100%">\n\n  <TBODY>\n\n  <TR>\n\n    <TD colSpan=3D3>\n\n      <HR color=3Dblack noShade SIZE=3D1>\n\n    </TD></TR></TD></TR>\n\n  <TR>\n\n    <TD colSpan=3D3>\n\n      <HR color=3Dblack noShade SIZE=3D1>\n\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\n\n --><FONT \n\ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\n\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 \n\nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=\n\n000

In [62]:
y

['spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 

In [63]:
X_train , X_test , Y_train , Y_test = train_test_split(x , y , test_size = 0.2)

In [64]:
X_train

['------=_NextPart_000_00B3_51A40A1A.A7486A51\n\nContent-Type: text/html; charset="iso-8859-1"\n\nContent-Transfer-Encoding: base64\n\n\n\n\n\nPCFET0NUWVBFIEhUTUwgUFVCTElDICItLy9XM0MvL0RURCBIVE1MIDQuMDEg\n\nVHJhbnNpdGlvbmFsLy9FTiI+DQo8aHRtbD4NCjxoZWFkPg0KPHRpdGxlPkFk\n\ndWx0IENsYXNzaWZpZWQgMmsyPC90aXRsZT4NCjxtZXRhIGh0dHAtZXF1aXY9\n\nIkNvbnRlbnQtVHlwZSIgY29udGVudD0idGV4dC9odG1sOyBjaGFyc2V0PWlz\n\nby04ODU5LTEiPg0KPC9oZWFkPg0KDQo8Ym9keSBiZ2NvbG9yPSIjRkZGRkZG\n\nIiBsaW5rPSIjQ0M5OTk5IiBsZWZ0bWFyZ2luPSIwIiB0b3BtYXJnaW49IjAi\n\nIG1hcmdpbndpZHRoPSIwIiBtYXJnaW5oZWlnaHQ9IjAiPg0KDQo8Y2VudGVy\n\nPg0KICA8YnI+DQogIDx0YWJsZSB3aWR0aD0iNjAwIiBib3JkZXI9IjAiIGNl\n\nbGxzcGFjaW5nPSIwIiBjZWxscGFkZGluZz0iMSI+DQogICAgPHRyPg0KICAg\n\nICAgPHRkIGJnY29sb3I9IiM5OTAwMDAiPjx0YWJsZSB3aWR0aD0iNjAwIiBi\n\nb3JkZXI9IjAiIGNlbGxzcGFjaW5nPSIwIiBjZWxscGFkZGluZz0iMCI+DQog\n\nICAgICAgICAgPHRyPg0KICAgICAgICAgICAgPHRkIGJnY29sb3I9IiNGRkZG\n\nRkYiPg0KICAgICAgICAgIA0KICAgICAgICAgIDxkaXYgYWxpZ249InJpZ2h0\n\nIj48aW1nIHNyYz0iaHR0cDovL

In [67]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(X_train)

classifier = MultinomialNB()
targets = Y_train
classifier.fit(counts, targets)

MultinomialNB()

In [68]:
example_counts = vectorizer.transform(X_test)
predictions = classifier.predict(example_counts)
predictions

array(['ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'h

In [69]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [70]:
cm = confusion_matrix(Y_test , predictions)
print(cm)
accuracy_score(Y_test ,predictions)

[[496   1]
 [ 20  83]]


0.965

In [71]:
from sklearn.metrics import classification_report
print(classification_report(Y_test , predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       497
        spam       0.99      0.81      0.89       103

    accuracy                           0.96       600
   macro avg       0.97      0.90      0.93       600
weighted avg       0.97      0.96      0.96       600

