### Naive Bayes model to classify future SMS messages as either spam or ham

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [3]:
# Load the tab-separated SMS corpus; column names are supplied here because
# none are read from the file itself.
# NOTE(review): default quote handling is in effect -- if any message contains an
# unbalanced double quote, adjacent rows could be merged; confirm the row count.
df = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    names=['spam', 'txt'],
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)


Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the label column as integers: 1 for 'spam', 0 for anything else ('ham').
# An already-encoded 1 is accepted as well, so re-running this cell leaves the
# column unchanged instead of failing on a missing 'spam' dummy column, and the
# resulting dtype is an explicit int regardless of the pandas version.
df['spam'] = df['spam'].isin(['spam', 1]).astype(int)


In [6]:
# Preview the first five rows again to check the re-encoded label column.
df.head(n=5)


Unnamed: 0,spam,txt
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# NLTK's English stop-word list, collapsed into a set of unique words.
english_stopwords = stopwords.words('english')
stopset = set(english_stopwords)

In [12]:
# TF-IDF features over lowercased, ASCII-folded text with the English stop words removed.
# stop_words is handed over as a (sorted) list: recent scikit-learn releases validate
# this parameter and reject a set, while older releases accept a list just as well.
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

In [13]:
# Target vector: the label column of the frame.
y = df['spam']

In [14]:
# Learn the vocabulary and IDF weights from the message text and build the feature matrix.
# NOTE(review): this fit sees every message, including those later held out by the
# train/test split further down -- consider fitting on the training portion only.
X = vectorizer.fit_transform(df['txt'])


In [15]:
# Show the label and feature-matrix shapes so their row counts can be compared.
# A parenthesised single-argument print runs under both Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(y.shape)
print(X.shape)

(5572L,)
(5572, 8605)


In [16]:
# Split features and labels into train and test partitions (no explicit size is
# passed, so the library default applies); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Naive Bayes Classifier

In [17]:
# Multinomial Naive Bayes with its default settings, trained on the training partition.
# fit() is left as the cell's last expression so the fitted estimator is displayed.
clf = naive_bayes.MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# ROC AUC on the test partition, scored with column 1 of the predicted probabilities.
spam_probability = clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, spam_probability)

0.98558587451336732

In [19]:
# Classify one unseen message with the already-fitted vectorizer and model.
# The raw text gets its own name so sms_test_array only ever holds the feature
# matrix; a plain list is enough for transform(), so numpy is not needed here.
sms_raw = ["I've been searching for the right words to thank you for this breather."]

sms_test_array = vectorizer.transform(sms_raw)

# Parenthesised print runs under both Python 2 and Python 3.
print(clf.predict(sms_test_array))

[ 0.]


In [20]:
# Classify a second unseen message with the already-fitted vectorizer and model.
# The raw text gets its own name so sms_test_array only ever holds the feature
# matrix; a plain list is enough for transform(), so numpy is not needed here.
sms_raw = ["Todays Voda numbers ending 7548 are selected to receive a $350 award. If you have a match please call 08712300220 quoting claim code 4041 standard rates app."]

sms_test_array = vectorizer.transform(sms_raw)

# Parenthesised print runs under both Python 2 and Python 3.
print(clf.predict(sms_test_array))

[ 1.]
