In [77]:
import pandas as pd

In [78]:
message_data=pd.read_csv('./SMSSpamCollection',sep='\t',names=["label","message"])

In [79]:
import re
import nltk

In [80]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [81]:
ps=PorterStemmer()

In [82]:
%%time
corpus=[]
for i in range(len(message_data)):
    review=re.sub('[^a-zA-Z]',' ',message_data['message'][i])
    review=review.lower()
    review=review.split()
    
    review=[ ps.stem(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

CPU times: user 8.62 s, sys: 2.38 s, total: 11 s
Wall time: 11 s


In [83]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500)
X=cv.fit_transform(corpus).toarray()

In [84]:
y=pd.get_dummies(message_data['label'],drop_first=True)

In [85]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [86]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model=MultinomialNB()
spam_detect_model.fit(X_train,Y_train)
y_pred=spam_detect_model.predict(X_test)

  return f(**kwargs)


In [87]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(Y_test,y_pred),columns=['SPAM','HAM'],index=['SPAM','HAM'])

Unnamed: 0,SPAM,HAM
SPAM,945,10
HAM,8,152


In [88]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,y_pred)

0.9838565022421525

In [89]:
from sklearn.metrics import precision_score, recall_score
print("Precision = ",precision_score(Y_test,y_pred_tfidf))
print("Recall = ",recall_score(Y_test,y_pred_tfidf))

Precision =  0.9927536231884058
Recall =  0.85625


### Now lets try with lemmetization and check the acuracy

In [90]:
from nltk.stem import WordNetLemmatizer

In [91]:
%%time
lemma=WordNetLemmatizer()

corpus_lemma=[]
for i in range(len(message_data)):
    review=re.sub('[^a-zA-Z]',' ',message_data['message'][i])
    review=review.lower()
    review=review.split()

    review=[ lemma.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    corpus_lemma.append(review)

CPU times: user 8.88 s, sys: 2.57 s, total: 11.4 s
Wall time: 11.1 s


In [92]:
cv=CountVectorizer(max_features=2500)
X=cv.fit_transform(corpus_lemma)
Y=pd.get_dummies(message_data['label'],drop_first=True)

In [93]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

In [94]:
from sklearn.naive_bayes import MultinomialNB
lemma_model=MultinomialNB()
lemma_model.fit(X_train,Y_train)

  return f(**kwargs)


MultinomialNB()

In [95]:
y_pred_lemma=lemma_model.predict(X_test)

In [96]:
pd.DataFrame(confusion_matrix(Y_test,y_pred_lemma))

Unnamed: 0,0,1
0,946,9
1,10,150


In [97]:
accuracy_score(Y_test,y_pred_lemma)

0.9829596412556054

In [98]:
from sklearn.metrics import precision_score, recall_score
print("Precision = ",precision_score(Y_test,y_pred_tfidf))
print("Recall = ",recall_score(Y_test,y_pred_tfidf))

Precision =  0.9927536231884058
Recall =  0.85625


In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=2500)
X=tfidf.fit_transform(corpus_lemma)

In [100]:
Y=pd.get_dummies(message_data['label'],drop_first=True)

In [101]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

In [102]:
tfidf_NB_Model=MultinomialNB()
tfidf_NB_Model.fit(X_train,Y_train)
y_pred_tfidf=tfidf_NB_Model.predict(X_test)

  return f(**kwargs)


In [103]:
pd.DataFrame(confusion_matrix(Y_test,y_pred_tfidf),index=['HAM(0)','SPAM(1)'],columns=['HAM','SPAM'])

Unnamed: 0,HAM,SPAM
HAM(0),954,1
SPAM(1),23,137


In [104]:
accuracy_score(Y_test,y_pred_tfidf)

0.97847533632287

In [105]:
from sklearn.metrics import precision_score, recall_score
print("Precision = ",precision_score(Y_test,y_pred_tfidf))
print("Recall = ",recall_score(Y_test,y_pred_tfidf))

Precision =  0.9927536231884058
Recall =  0.85625


In [106]:
message_data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

### Recall score :- There are actually 23 SPAM, which are wrongly classified as HAM
### Precision score :- There are actually 1 HAM, which  was wrongly classified as SPAM

### In my personal opinion, its ok, if the SPAM is detected as HAM and shown in INBOX, rather than marking the HAM as SPAM and pushing it to SPAM folder. So having higher precision score is more better here.