Goal:  Train a Naive Bayes model to classify future SMS messages as either spam or ham.


In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
import numpy as np

In [2]:
SMS= pd.read_csv("SMSSpamCollection",sep='\t', names=['spam', 'txt'])

In [3]:
SMS.head(10)

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
#lets convert values in spam column to numerics 
SMS['spam'] = pd.get_dummies(SMS.spam)['spam']

In [5]:
SMS.head(10)

Unnamed: 0,spam,txt
0,0.0,"Go until jurong point, crazy.. Available only ..."
1,0.0,Ok lar... Joking wif u oni...
2,1.0,Free entry in 2 a wkly comp to win FA Cup fina...
3,0.0,U dun say so early hor... U c already then say...
4,0.0,"Nah I don't think he goes to usf, he lives aro..."
5,1.0,FreeMsg Hey there darling it's been 3 week's n...
6,0.0,Even my brother is not like to speak with me. ...
7,0.0,As per your request 'Melle Melle (Oru Minnamin...
8,1.0,WINNER!! As a valued network customer you have...
9,1.0,Had your mobile 11 months or more? U R entitle...


In [6]:
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [7]:
#in this case, the variable will be liked as 0 (that is neither SPAM or HAM) or 1(that is SPAM or HAM)
y = SMS.spam

In [8]:
#convert text to features
X= vectorizer.fit_transform(SMS.txt)

In [9]:
print y.shape
print X.shape

(5572L,)
(5572, 8587)


In [10]:
X_train, X_test,y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
#Lets train the naive bayes classifier
classifier = naive_bayes.MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
roc_auc_score(y_test, classifier.predict_proba(X_test)[:,1])

0.98589322144123448

In [14]:
sms_array=np.array(["This is an example sentence"])
sms_review_vector = vectorizer.transform(sms_array)
print classifier.predict(sms_review_vector)

[ 0.  0.]


The line - "This is an example sentence" is predicted as not SPAM as the value is '0'
