## Data loading 

In [45]:
# import data
import pandas as pd
import numpy as np

data = pd.read_csv('sms_spam.csv')
data.head()


Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
from sklearn.model_selection import  train_test_split

In [47]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['type'], test_size=0.75, random_state=42)

In [65]:
## Requirements to build a spam filter 
# 1. word frequency table 
# 2. Binarized outpout 

## Pre-process

In [48]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [70]:
# Create word frequency table 
count_vector = CountVectorizer()
train_word_count = count_vector.fit_transform(X_train)
list(count_vector.vocabulary_.items())[0:5]

[('not', 2512), ('free', 1521), ('today', 3601), ('haf', 1682), ('pick', 2702)]

In [62]:
# Length of word vector
len(count_vector.vocabulary_)

4060

In [63]:
# Encode data['type'] column into binary form
labelEncoder = LabelEncoder()

In [64]:
y_train = labelEncoder.fit_transform(y_train)
y_test = labelEncoder.fit_transform(y_test)

## Train

In [66]:
from sklearn.naive_bayes import MultinomialNB

In [67]:
clf = MultinomialNB().fit(train_word_count, y_train.ravel()) # Instantiate and fit at a time

## Test

In [79]:
# Create word frequency table for test data
test_word_count = count_vector.transform(X_test)

In [80]:
pred = clf.predict(test_word_count)

In [90]:
# Prediction accuracy check
from sklearn.metrics import average_precision_score
average_precision_score(y_test, pred)

0.89691786350193425

## Sanity Check

In [104]:
clf.predict(count_vector.transform([' win a big prize today']))


array([1], dtype=int64)

In [101]:
clf.predict(count_vector.transform(['I am lazy today']))

array([0], dtype=int64)