In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

In [2]:
messages = pd.read_csv('data/smsspamcollection/SMSSpamCollection',sep='\t',names=['label','message'])
messages.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
messages['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

### Note: Dataset is Unbalanced...
    To Deal with Unbalanced Dataset, different evaluation matrics were used and also 
    tried KFold Cross Validation...

In [4]:
ps = PorterStemmer()
messages_preprocessed = []

In [5]:
for message in messages['message']:
    message = re.sub('[^a-zA-Z]',' ',message)
    message = message.lower()
    words = message.split()
    stemmed_words = [ps.stem(word) for word in words if word not in set(stopwords.words('english'))]
    s = " ".join(stemmed_words)
    messages_preprocessed.append(s)
    

## Using BagOfWords Apporach

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
X = CountVectorizer(max_features=5000).fit_transform(messages_preprocessed).toarray()

In [8]:
y = messages['label'].map({'ham':0,'spam':1})

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=True)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,f1_score,recall_score,precision_score

In [12]:
model = MultinomialNB()
model.fit(X_train,y_train)
predictions = model.predict(X_test)
print("Accuracy:",model.score(X_test,y_test)*100)
print("Recall:",recall_score(y_test,predictions))
print("Precision:",precision_score(y_test,predictions))
print("F1-Score:",f1_score(y_test,predictions))
confusion_matrix(y_test,predictions)

Accuracy: 98.06173725771716
Recall: 0.9603960396039604
Precision: 0.9107981220657277
F1-Score: 0.9349397590361446


array([[1172,   19],
       [   8,  194]])

In [13]:
model = LogisticRegression(solver='lbfgs') # Solver is set here to avoid warning!! although lbfgs solver is the default one to avoid warning explictly mentioned here...
model.fit(X_train,y_train)
predictions = model.predict(X_test)
print("Accuracy:",model.score(X_test,y_test)*100)
print("Recall:",recall_score(y_test,predictions))
print("Precision:",precision_score(y_test,predictions))
print("F1-Score:",f1_score(y_test,predictions))
confusion_matrix(y_test,predictions)

Accuracy: 98.27709978463747
Recall: 0.8910891089108911
Precision: 0.989010989010989
F1-Score: 0.9375


array([[1189,    2],
       [  22,  180]])

## Using TfidfVectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
X = TfidfVectorizer(max_features=5000).fit_transform(messages_preprocessed).toarray()

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=True)

In [17]:
model = MultinomialNB()
model.fit(X_train,y_train)
predictions = model.predict(X_test)
print("Accuracy:",model.score(X_test,y_test)*100)
print("Recall:",recall_score(y_test,predictions))
print("Precision:",precision_score(y_test,predictions))
print("F1-Score:",f1_score(y_test,predictions))
confusion_matrix(y_test,predictions)

Accuracy: 96.41062455132807
Recall: 0.75
Precision: 1.0
F1-Score: 0.8571428571428571


array([[1193,    0],
       [  50,  150]])

In [18]:
model = LogisticRegression(solver='lbfgs') # Solver is set here to avoid warning!! although lbfgs solver is the default one to avoid warning explictly mentioned here...
model.fit(X_train,y_train)
predictions = model.predict(X_test)
print("Accuracy:",model.score(X_test,y_test)*100)
print("Recall:",recall_score(y_test,predictions))
print("Precision:",precision_score(y_test,predictions))
print("F1-Score:",f1_score(y_test,predictions))
confusion_matrix(y_test,predictions)

Accuracy: 95.69274946159368
Recall: 0.7
Precision: 1.0
F1-Score: 0.8235294117647058


array([[1193,    0],
       [  60,  140]])

## Using K-Fold Cross Validation

In [19]:
from sklearn.model_selection import KFold

In [20]:
kf = KFold(10)

In [21]:
model = MultinomialNB()
scores = []
for train_index,test_index in kf.split(X):
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    model.fit(X_train,y_train)
    scores.append(model.score(X_test,y_test))
print(scores)

[0.989247311827957, 0.967741935483871, 0.9640933572710951, 0.9838420107719928, 0.9640933572710951, 0.9802513464991023, 0.9694793536804309, 0.9694793536804309, 0.9694793536804309, 0.9802513464991023]


In [22]:
model = LogisticRegression(solver='lbfgs')
scores = []
for train_index,test_index in kf.split(X):
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    model.fit(X_train,y_train)
    scores.append(model.score(X_test,y_test))
print(scores)

[0.9605734767025089, 0.953405017921147, 0.9622980251346499, 0.9838420107719928, 0.9605026929982047, 0.9694793536804309, 0.9569120287253142, 0.9622980251346499, 0.9658886894075404, 0.9694793536804309]


In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=True)
print("Accuracy of Logistic Regression After KFold:",model.score(X_test,y_test))
confusion_matrix(y_test,model.predict(X_test))

Accuracy of Logistic Regression After KFold: 0.9806173725771715


array([[1219,    0],
       [  27,  147]])

#### After KFold Model Prefomance Imporved..!!!

#### Conclusion:
    1) Since Dataset is Unbalenced, after trying f1-score,precision_score,recall_score and KFold Cross
        Validation the accuracy remains almost similar...
    2) Using TfidfVectorizer doesn't imporve accuracy and hence using BagOfWords is quite sufficient...