In [223]:
from __future__ import division

import string
import re

import pandas as pd
import numpy as np

from collections import Counter

In [224]:
def remove_punctuation(s):
    """see http://stackoverflow.com/questions/265960/best-way-to-\
    strip-punctuation-from-a-string-in-python"""
    table = s.maketrans("", "", string.punctuation)
    return s.translate(table)

In [225]:
def clean(x):
    """ Wrapper function to remove punctuation and tokenize """
    x = remove_punctuation(x)
    return " ".join(re.split("\W+", x))

In [226]:
def naive(document):
    """ term frequency implementation of naive bayes. Takes a
    vector and classifies it as either 'ham' or 'spam' """
    prob_spam = []
    prob_ham = []
    for word in document:
        tf_ham = (ham_vocab[word] if word in ham_vocab.keys() else 1/word_count)
        tf_spam = (spam_vocab[word] if word in spam_vocab.keys() else 1/word_count)

        prob_spam.append(tf_spam)
        prob_ham.append(tf_ham)
    if np.prod(prob_ham)*prior_ham > np.prod(prob_spam)*prior_spam:
        return "ham"
    else:
        return "spam"

In [227]:
df = pd.read_csv("SMSSpamCollection.csv", sep="\t", header=None,
                     names=["type", "text"])
df.shape

(5572, 2)

In [228]:
#df.text = df.text.apply(clean)
df.text = df.text.str.lower()

In [229]:
prior_ham = len(df[df['type']=='ham'])/len(df)
prior_spam = len(df[df['type']=='spam'])/len(df)
prior_ham, prior_spam

(0.8659368269921034, 0.13406317300789664)

In [230]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2)

In [231]:
ham_vocab = Counter(" ".join(list(train[train['type']=='ham'].text)).split())
spam_vocab = Counter(" ".join(list(train[train['type']=='spam'].text)).split())
Total_vocab = spam_vocab + ham_vocab
spam_len = sum(spam_vocab.values())
ham_len = sum(ham_vocab.values())  
word_count = spam_len + ham_len

spam_vocab = {k:(v+1)/word_count for k, v in spam_vocab.items()}
ham_vocab = {k:(v+1)/word_count for k, v in ham_vocab.items()}


word_count, spam_len, ham_len

(69443, 14827, 54616)

In [232]:
pred = test["text"].apply(run_bayes)

In [233]:
pd.crosstab(pred, test["type"])

type,ham,spam
text,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,983,59
spam,3,70


In [234]:
acc = sum(pred == test["type"])/len(test)
print("Accuracy is: %s%%" % round(acc*100, 2))

Accuracy is: 94.44%
