In [2]:
import pandas as pd

In [3]:
messages = pd.read_csv('SMSSpamCollection',sep='\t', names=['label','message'])

In [4]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
ps=PorterStemmer()

In [7]:
corpus=[]

In [8]:
for i in range(0, len(messages)):
    review=re.sub('[^a-zA-Z0-9]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if review not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [9]:
corpus

['go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat',
 'ok lar joke wif u oni',
 'free entri in 2 a wkli comp to win fa cup final tkt 21st may 2005 text fa to 87121 to receiv entri question std txt rate t c s appli 08452810075over18 s',
 'u dun say so earli hor u c alreadi then say',
 'nah i don t think he goe to usf he live around here though',
 'freemsg hey there darl it s been 3 week s now and no word back i d like some fun you up for it still tb ok xxx std chg to send 1 50 to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'as per your request mell mell oru minnaminungint nurungu vettam ha been set as your callertun for all caller press 9 to copi your friend callertun',
 'winner as a valu network custom you have been select to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour onli',
 'had your mobil 11 month or more u r entitl to updat to the latest colour mobil wit

In [14]:
## Bow
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=6000)
## max features because if we did not provide this we will get lots of columns hence we are using this rare features will removed top frequent feature

In [15]:
X=cv.fit_transform(corpus).toarray()

In [16]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
X.shape

(5572, 6000)

In [18]:
y=pd.get_dummies(messages['label'])
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [19]:
y=y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=45)

In [21]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4179, 6000), (1393, 6000), (4179,), (1393,))

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
spam_detect_model=MultinomialNB().fit(X_train,y_train)

In [25]:
y_pred=spam_detect_model.predict(X_test)

In [26]:
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=uint8)

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
confusion=confusion_matrix(y_test,y_pred)

In [29]:
confusion

array([[1197,   14],
       [  14,  168]], dtype=int64)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
accuracy=accuracy_score(y_test,y_pred)

In [32]:
accuracy

0.9798994974874372

# Using lemmatization and TFIDF

In [33]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [34]:
corpus=[]

In [35]:
for i in range(0, len(messages)):
    review=re.sub('[^a-zA-Z0-9]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[lemmatizer.lemmatize(word) for word in review if review not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [36]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been 3 week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send 1 50 to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press 9 to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour only',
 'had your mobile 11 month or more u r entitled to update to th

In [42]:
## TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [43]:
X1=cv.fit_transform(corpus).toarray()

In [44]:
X1.shape

(5572, 6000)

In [46]:
y1=pd.get_dummies(messages['label'])
y1=y1.iloc[:,1].values
y1

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1,y1,test_size=0.25,random_state=45)

In [52]:
spam_detect_model2=MultinomialNB().fit(X_train,y_train)

In [53]:
y_pred1=spam_detect_model2.predict(X_test)

In [54]:
y_pred1

array([0, 0, 0, ..., 1, 0, 1], dtype=uint8)

In [55]:
confusion=confusion_matrix(y_test,y_pred1)
confusion

array([[1198,   13],
       [  16,  166]], dtype=int64)

In [56]:
accuracy=accuracy_score(y_test,y_pred1)
accuracy

0.9791816223977028