In [1]:
#importing the dataset
import pandas as pd
message = pd.read_csv("SMSSpamCollection",sep='\t',names=["label","message"])

In [2]:
message

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


#### Data cleaning and preprocessing

In [3]:
import re  #this library is used for regular expression
import nltk 

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
ps = PorterStemmer()
corpus = []

In [6]:
for i in range(0,len(message)):
    #i will remove all the character except a to z or
    #A to Z and replace it with blank 
    review = re.sub('[^a-zA-Z]',' ',message['message'][i])
    review = review.lower()
    review = review.split()
    #stemming
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus[0:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [10]:
#creating the bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
x = cv.fit_transform(corpus).toarray()

In [11]:
x[0:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
y = pd.get_dummies(message['label'])
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
5,0,1
6,1,0
7,1,0
8,0,1
9,0,1


In [16]:
y = y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [17]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [20]:
#it does work very well for nlp problem
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [21]:
y_pred = spam_detect_model.predict(X_test)

In [23]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test,y_pred)

In [24]:
confusion_m

array([[946,   9],
       [  8, 152]])

In [25]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test,y_pred)
score

0.9847533632286996