In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the dataset
df = pd.read_csv('Spam SMS Collection', sep='\t', names=['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## <font color='red'>Separate dependent & independent features</font>

In [4]:
# Separating the independent & dependent features

y = df.label.copy()
x = df.message.copy()

In [5]:
# create dummies of y

y = pd.get_dummies(y).iloc[:, 1]
y = pd.DataFrame(y)
y.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


## <font color='red'>Preprocess the message</font>

In [6]:
x.loc[1]

'Ok lar... Joking wif u oni...'

In [7]:
# Now Lets process x

porter = PorterStemmer()

corpus = list()
for i in range(len(x)):
    message = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=x.loc[i])
    message = message.lower()
    message = re.sub(pattern='[\s+]', repl=' ', string=message)
    message = [porter.stem(word) for word in message.split() if not word in stopwords.words('english')]
    message = ' '.join(message)
    corpus.append(message)

In [8]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

## <font color='red'>Finding optimum max_features</font>

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

clssifier = MultinomialNB()

scores = dict()
for i in np.linspace(start=1000, stop=10000, num=20):
    cv = CountVectorizer(max_features=int(i))
    new_x = cv.fit_transform(corpus).toarray()
    
    score = cross_val_score(estimator=clssifier,X=new_x, y=y, cv=5, n_jobs=-1, verbose=0)
    scores[i] = score.mean()

In [10]:
scores

{1000.0: 0.9833088856864529,
 1473.6842105263158: 0.9838474853273864,
 1947.3684210526317: 0.984744990379274,
 2421.0526315789475: 0.9858220286448061,
 2894.7368421052633: 0.9861810950720951,
 3368.4210526315787: 0.9854636062828573,
 3842.1052631578946: 0.98456610123097,
 4315.78947368421: 0.9851043788392333,
 4789.473684210527: 0.9845657791982998,
 5263.157894736842: 0.9840275015900364,
 5736.8421052631575: 0.9843864070009902,
 6210.526315789473: 0.9838478073600566,
 6684.210526315789: 0.9836684351627472,
 7157.894736842105: 0.9829504633245042,
 7631.578947368421: 0.9827710911271949,
 8105.263157894737: 0.9827710911271949,
 8578.947368421053: 0.9827710911271949,
 9052.631578947368: 0.9827710911271949,
 9526.315789473683: 0.9827710911271949,
 10000.0: 0.9827710911271949}

<font color='green'>It is seen that max_feature = 2894 gives a good accuracy. So we'll consider that.</font>

## <font color='red'>Finding Optimum ngram_range</font>

In [11]:
ngram_scores = dict()

for i in range(1,10):
    for j in range(i, 10):
        cv = CountVectorizer(max_features=2894, ngram_range=(i, j))
        new_x = cv.fit_transform(corpus).toarray()

        score = cross_val_score(estimator=clssifier,X=new_x, y=y, cv=5, n_jobs=-1, verbose=0)
        ngram_scores[str(i)+','+str(j)] = score.mean()

In [12]:
ngram_scores

{'1,1': 0.9861810950720951,
 '1,2': 0.9858218676284709,
 '1,3': 0.9851037347738929,
 '1,4': 0.9838471632947163,
 '1,5': 0.9843854409029797,
 '1,6': 0.9842059076893351,
 '1,7': 0.9840263744756905,
 '1,8': 0.9842059076893351,
 '1,9': 0.9838468412620461,
 '2,2': 0.9730791958844225,
 '2,3': 0.9673363872764892,
 '2,4': 0.9655415381890492,
 '2,5': 0.9628496670987271,
 '2,6': 0.9632084114933459,
 '2,7': 0.9623109064414586,
 '2,8': 0.9626698118524125,
 '2,9': 0.9615930956195505,
 '3,3': 0.947774673740651,
 '3,4': 0.9380826174815435,
 '3,5': 0.9348526297992932,
 '3,6': 0.9316221590680375,
 '3,7': 0.9296474547342829,
 '3,8': 0.9264174670520324,
 '3,9': 0.9264174670520324,
 '4,4': 0.9386215391551472,
 '4,5': 0.9294690486349841,
 '4,6': 0.9215720024796517,
 '4,7': 0.9208541916577436,
 '4,8': 0.9199568476221913,
 '4,9': 0.9170857653508948,
 '5,5': 0.9334166861228071,
 '5,6': 0.9271352778739402,
 '5,7': 0.9167257328255951,
 '5,8': 0.9142135559652527,
 '5,9': 0.9134955841270097,
 '6,6': 0.92928871033

<font color='green'>It is seen that n_gram=(1,1) gives the best result so we'll consider that</font>

## <font color='red'>Finding Optimum alpha</font>

In [13]:
alpha_score = dict()

for i in [0.0001, 0.0005, 0.001, 0.005, 0.009, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]:
    cv = CountVectorizer(max_features=2894, ngram_range=(1, 1))
    new_x = cv.fit_transform(corpus).toarray()
    
    classifier = MultinomialNB(alpha=i)
    score = cross_val_score(estimator=clssifier,X=new_x, y=y, cv=5, n_jobs=-1, verbose=0)
    alpha_score[i] = score.mean()

In [14]:
alpha_score

{0.0001: 0.9861810950720951,
 0.0005: 0.9861810950720951,
 0.001: 0.9861810950720951,
 0.005: 0.9861810950720951,
 0.009: 0.9861810950720951,
 0.1: 0.9861810950720951,
 0.2: 0.9861810950720951,
 0.4: 0.9861810950720951,
 0.6: 0.9861810950720951,
 0.8: 0.9861810950720951,
 1.0: 0.9861810950720951}

<font color='green'>we'll consider the default alpha value</font>

## <font color='red'>Creating Bag-of-Words</font>

In [15]:
type(x)

pandas.core.series.Series

In [16]:
classifier = MultinomialNB()

cv = CountVectorizer(max_features=2894, ngram_range=(1, 1))
x_vectorized = cv.fit_transform(x).toarray()

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, test_size=0.3, random_state=0)

classifier.fit(x_train, y_train)

<IPython.core.display.Javascript object>

MultinomialNB()

In [18]:
y_pred = classifier.predict(x_test)

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9874401913875598
[[1442    9]
 [  12  209]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1451
           1       0.96      0.95      0.95       221

    accuracy                           0.99      1672
   macro avg       0.98      0.97      0.97      1672
weighted avg       0.99      0.99      0.99      1672



## <font color='red'>Handling Imbalanced Data</font>

In [20]:
y.value_counts()

spam
0       4825
1        747
dtype: int64

In [21]:
x.shape, y.shape

((5572,), (5572, 1))

In [26]:
# As our data is imbalanced so we'll use Oversampling
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

os = RandomOverSampler() 
x_balanced, y_balanced = os.fit_sample(x_vectorized, df.iloc[:,0])

print(f"The nmber of classes before the fit {Counter(df.iloc[:,0])}")
print(f"The nmber of classes after the fit {Counter(y_balanced)}")

The nmber of classes before the fit Counter({'ham': 4825, 'spam': 747})
The nmber of classes after the fit Counter({'ham': 4825, 'spam': 4825})


In [27]:
len(x_balanced), len(y_balanced)

(9650, 9650)

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x_balanced, y_balanced,
                                                    test_size=0.3, random_state=0)

classifier.fit(x_train, y_train)

<IPython.core.display.Javascript object>

MultinomialNB()

In [29]:
y_pred = classifier.predict(x_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9789291882556131
[[1419   19]
 [  42 1415]]
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98      1438
        spam       0.99      0.97      0.98      1457

    accuracy                           0.98      2895
   macro avg       0.98      0.98      0.98      2895
weighted avg       0.98      0.98      0.98      2895



<font color='green'>As handling the imbalanced dataset did not improved the accuracy so we'll consider without balancing.</font>

In [30]:
# Creating a pickle file for the CountVectorizer

pickle.dump(cv, open('cv-transform.pkl', 'wb'))

<IPython.core.display.Javascript object>

In [31]:
# Creating a pickle file for the Multinomial Naive Bayes model
filename = 'spam-sms-mnb-model.pkl'
pickle.dump(classifier, open(filename, 'wb'))

<IPython.core.display.Javascript object>