**Context**

The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. 
It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam.

**Objective**

To classify the messages as Spam or Ham using NLP.

<h1>Importing Libraries</h1>

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

<h1>Loading Data</h1>

In [2]:
data = pd.read_csv('spam.csv', encoding='Latin-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [5]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.rename(columns= {"v1":"label", "v2":"message"}, inplace=True) 

In [7]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


<h1>Handling Categorical Data</h1>

In [8]:
data = pd.get_dummies(data, columns=['label'])

In [9]:
data.head()

Unnamed: 0,message,label_ham,label_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [10]:
data['label_ham'].value_counts()

1    4825
0     747
Name: label_ham, dtype: int64

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   message     5572 non-null   object
 1   label_ham   5572 non-null   uint8 
 2   label_spam  5572 non-null   uint8 
dtypes: object(1), uint8(2)
memory usage: 54.5+ KB


In [12]:
data['count'] = 0
for i in np.arange(0, len(data.message)):
  data.loc[i, 'count'] = len(data.loc[i, 'message'])

In [13]:
data.head()

Unnamed: 0,message,label_ham,label_spam,count
0,"Go until jurong point, crazy.. Available only ...",1,0,111
1,Ok lar... Joking wif u oni...,1,0,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1,155
3,U dun say so early hor... U c already then say...,1,0,49
4,"Nah I don't think he goes to usf, he lives aro...",1,0,61


In [14]:
data.describe()

Unnamed: 0,label_ham,label_spam,count
count,5572.0,5572.0,5572.0
mean,0.865937,0.134063,80.118808
std,0.340751,0.340751,59.690841
min,0.0,0.0,2.0
25%,1.0,0.0,36.0
50%,1.0,0.0,61.0
75%,1.0,0.0,121.0
max,1.0,1.0,910.0


<h1>Processing Message</h1>

In [15]:
data['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

**Preparing Word Vector Corpus**

In [16]:
corpus = []

**Using Porter Stemmer**

In [17]:
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
ps = PorterStemmer()

In [19]:
for i in range(0, 5572):
  #regular expressions
  msg = data['message'][i]
  #deal with email addresses
  msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', data['message'][i])
  #urls
  msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', data['message'][i])
  #money symbols
  msg = re.sub('([A-Z]{3}|[A-Z]?[\$€¥])?\s?(\d{1,3}((,\d{1,3})+)?(.\d{1,3})?(.\d{1,3})?(,\d{1,3})?)', 'moneysymb', data['message'][i])
  #phone numbers
  msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', data['message'][i])
  #numbers
  msg = re.sub('\d+(\.\d+)?', 'numbr', data['message'][i])

  #Removing punctuations
  msg = re.sub('[^\w\d\s]', ' ', data['message'][i])
  if i==0:
    print("\t\t\t\t Message", i)

  if i==0:
    print("\n After Regular Expression - Message ", i, " : ", msg)

  #Each Word to lowercase
  msg = msg.lower()
  if i==0:
    print("\n Lower case Message ", i, " : ", msg)
  
  #Splitting words 
  msg = msg.split()    
  if i==0:
    print("\n After Splitting Message ", i, " : ", msg)
    
  #Stemming with PorterStemmer handling Stop Words
  msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
  if i==0:
    print("\n After Stemming Message ", i, " : ", msg)
    
  # preparing Messages with Remaining Tokens
  msg = ' '.join(msg)
  if i==0:
    print("\n Final Prepared Message ", i, " : ", msg, "\n\n")
    
  # Preparing WordVector Corpus
  corpus.append(msg)

				 Message 0

 After Regular Expression - Message  0  :  Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

 Lower case Message  0  :  go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   

 After Splitting Message  0  :  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']

 After Stemming Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared Message  0  :  go jurong point crazi avail bugi n great world la e buffet cine got amor wat 




<h1>Preparing Vectors for Each Message</h1>

In [20]:
cv = CountVectorizer()

In [21]:
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [22]:
#converting messages to numeric form
data_input = cv.fit_transform(corpus).toarray()

In [23]:
data_input[0]

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
data_input

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
data_input.shape

(5572, 7213)

<h1>Applying Classification</h1>

> 

*   **Input: Prepared Sparse Matrix/Vectors for Each Message**
*   **Output: Label i.e. Spam or Ham**





In [26]:
data.head()

Unnamed: 0,message,label_ham,label_spam,count
0,"Go until jurong point, crazy.. Available only ...",1,0,111
1,Ok lar... Joking wif u oni...,1,0,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1,155
3,U dun say so early hor... U c already then say...,1,0,49
4,"Nah I don't think he goes to usf, he lives aro...",1,0,61


In [27]:
data_output = data['label_ham']

In [28]:
data_output.value_counts()

1    4825
0     747
Name: label_ham, dtype: int64

**Data Splitting**

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
train_x, test_x, train_y, test_y = train_test_split(data_input, data_output, test_size=0.20, random_state=0)

<h1>ML Model</h1>

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

**Training**

In [32]:
nvb = GaussianNB()
nvb.fit(train_x, train_y)

dec = tree.DecisionTreeClassifier()
dec.fit(train_x, train_y)

# rf = RandomForestClassifier(n_estimators=200)
# rf.fit(train_x, train_y)

**Predictions**

In [33]:
pred_nvb = nvb.predict(test_x)
pred_dec = dec.predict(test_x)

In [34]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(test_y, pred_nvb))
print (classification_report(test_y, pred_nvb))

Accuracy : 0.87085 


              precision    recall  f1-score   support

           0       0.54      0.89      0.67       166
           1       0.98      0.87      0.92       949

    accuracy                           0.87      1115
   macro avg       0.76      0.88      0.80      1115
weighted avg       0.91      0.87      0.88      1115



In [35]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(test_y, pred_dec))
print (classification_report(test_y, pred_dec))

Accuracy : 0.97130 


              precision    recall  f1-score   support

           0       0.97      0.83      0.90       166
           1       0.97      1.00      0.98       949

    accuracy                           0.97      1115
   macro avg       0.97      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [36]:
# print ("Accuracy : %0.5f \n\n" % accuracy_score(test_y, pred_rf))
# print (classification_report(test_y, pred_rf))

<h1>Final Accuracy</h1>


> 

*   **Random Forest : 97.220%**
*   **Decision Tree : 97.040%**
*   **GaussianNB : 87.085%**







<h1>Hyperparameter Tuning for Random Forest</h1>

In [40]:
from sklearn.model_selection import GridSearchCV

In [42]:
params = {
  'bootstrap': [True, False],
 'max_depth': [4, 6, 8, 10, 12, 16, None],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [100, 200, 300, 400, 500, 600],
 }
random_rf = GridSearchCV(estimator = rf, param_grid = params, cv = 3, verbose=2, n_jobs = -1)
random_rf.fit(train_x, train_y)

Fitting 3 folds for each of 168 candidates, totalling 504 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 53.5min
[Parallel(n_jobs=-1)]: Done 504 out of 504 | elapsed: 116.5min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              rando

In [43]:
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False)
rf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
pred_rf = rf.predict(test_x)
print ("Accuracy : %0.5f \n\n" % accuracy_score(test_y, pred_rf))
print (classification_report(test_y, pred_rf))

Accuracy : 0.97309 


              precision    recall  f1-score   support

           0       1.00      0.82      0.90       166
           1       0.97      1.00      0.98       949

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



Accuracy increased by:
**0.089%**

<h1> KFold Cross validation</h1>

In [45]:
from sklearn.model_selection import KFold

In [46]:
kfold = KFold(n_splits=5, random_state=0, shuffle=False)



In [47]:
# preparing a fresh random forest
rf1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False)
accuracy = []
for train_index,test_index in kfold.split(data_input):
  xtrain,xtest = data_input[train_index], data_input[test_index]
  ytrain,ytest = data_output[train_index], data_output[test_index]

  rf1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False)
  rf1.fit(xtrain, ytrain)

  prediction = rf1.predict(xtest)
  accuracy.append(accuracy_score(ytest, prediction))


In [48]:
accuracy

[0.9766816143497757,
 0.97847533632287,
 0.9757630161579892,
 0.9712746858168761,
 0.9748653500897666]

In [49]:
np.mean(accuracy)

0.9754120005474555

*Maximum Accuracy Score:* **97.541%**