# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [2]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('/users/ritwikchakradhar/Downloads/MLCourse/emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('/users/ritwikchakradhar/Downloads/MLCourse/emails/ham', 'ham'))


Let's have a look at that DataFrame:

In [5]:
data.sample(5)

Unnamed: 0,message,class
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/01938.da956f847b734888d96d8dd4ee736f40,URL: http://www.joelonsoftware.com/news/200209...,ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/spam/00292.dbf78a2aaa230d288eb80ab843804252,"REGISTER .COM, .BIZ, AND .INFO DOMAINS FOR ONL...",spam
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/01403.240c78274ebf62c726f214797242b409,\n\n--hdW7zL/qDS6RXdAL\n\nContent-Type: text/p...,ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/01991.27c828be78e15dbf223bb3f3944508d6,"URL: http://www.newsisfree.com/click/-4,827608...",ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/01656.705cfa5ceb056324fe8fef48d12754db,> I guess MUA-level filtering is just a fallba...,ham


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [6]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB()

Let's try it out:

In [12]:
examples = ['Hello is there anybody in there?', "Get Free Best Deals Now!"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['ham', 'ham'], dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['class'], test_size = 0.2)

In [53]:
print(y_train)

/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/00827.b863d1780c6c6ed248a3e9136bd52b72      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/02146.08b5ca9cb17ad30a0295e17560307aeb      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/00425.0ba16e840d94d629f8a3881b4e03a3ad      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/02274.d25fe5d3adc798112cd281bd89d642db      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/00719.6d73cfb0bea6fe5002fbd3b260d480e6      ham
                                                                                                 ... 
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/00628.4a185fee450d8239cf82bee902dbd1af      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/00911.dcbdde154d9f25c1afe32f4b8f5f1f9b      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/ham/00378.46432f84e1aab28c26cf1bc5aa2d36bc      ham
/users/ritwikchakradhar/Downloads/MLCourse/emails/spam/00156.0b541afe96820e3bb8f90

In [61]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(X_train.values)

classifier = MultinomialNB()
targets = y_train.values
classifier.fit(counts, targets)

MultinomialNB()

In [55]:

counts_test = vectorizer.transform(X_test)

prediction = classifier.predict(counts_test)

In [56]:
from sklearn.metrics import r2_score

In [57]:
per_predict = 0

for x, y in prediction:
    if(x==y):
        per_predict +=1
        
print(per_predict/y_test.count())

ValueError: too many values to unpack (expected 2)

In [58]:
import numpy as np
print(np.bincount(prediction == y_test))

[ 28 572]


In [59]:
unique, counts = np.unique(prediction == y_test, return_counts=True)

print(np.asarray((unique, counts)).T)

[[  0  28]
 [  1 572]]


In [62]:
print(counts)

  (0, 33877)	3
  (0, 18578)	1
  (0, 48951)	1
  (0, 22897)	1
  (0, 46670)	1
  (0, 28672)	9
  (0, 31894)	1
  (0, 49335)	7
  (0, 46623)	2
  (0, 18023)	1
  (0, 10381)	2
  (0, 9560)	8
  (0, 11520)	4
  (0, 27755)	4
  (0, 48860)	8
  (0, 39305)	1
  (0, 40279)	1
  (0, 36954)	4
  (0, 9764)	1
  (0, 38366)	1
  (0, 27911)	2
  (0, 55840)	1
  (0, 14474)	1
  (0, 20793)	1
  (0, 53667)	1
  :	:
  (2399, 28632)	1
  (2399, 32040)	4
  (2399, 24608)	1
  (2399, 27587)	2
  (2399, 27158)	3
  (2399, 33074)	1
  (2399, 32092)	1
  (2399, 50768)	1
  (2399, 47455)	1
  (2399, 33106)	1
  (2399, 32096)	1
  (2399, 50648)	1
  (2399, 15041)	1
  (2399, 44734)	1
  (2399, 25718)	1
  (2399, 34574)	1
  (2399, 14269)	1
  (2399, 28689)	1
  (2399, 19198)	1
  (2399, 16581)	1
  (2399, 42782)	1
  (2399, 48913)	1
  (2399, 2341)	1
  (2399, 53747)	1
  (2399, 26108)	1
