# Problem Statement :

# SMS Spam Detection using Natural Language Processing with Python

NLP is commonly used in text classification task such as spam detection and sentiment analysis, text generation, language translations and document classification.

# Required Libraries

In [102]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# 1. Data Gathering

In [70]:
df = pd.read_csv("SMSSpamCollection", sep = '\t', names=['Label','Msg'])
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 2. Exploratory Data Analysis (EDA)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Label    5572 non-null object
Msg      5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [100]:
df.shape

(5572, 2)

In [101]:
df.isna().sum()

Label    0
Msg      0
dtype: int64

In [74]:
df['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

# 3. Data Pre-Processing

In [75]:
lm = WordNetLemmatizer()
stopword = stopwords.words('english')
corpus = []
for data in range (len(df)):
    review = re.sub('^a-zA-Z0-9',' ',df['Msg'][data])
    review = review.lower()
    review = review.split()
    review = [x for x in review if x not in stopwords.words('english')]
    review = [lm.lemmatize(x) for x in review]
    review = " ".join(review)
    corpus.append(review)    

In [77]:
len(corpus)

5572

In [78]:
df['Msg']=corpus

In [79]:
df.head()

Unnamed: 0,Label,Msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


# 4. Model Building

## 4.1 Data Splitting into the Train and Test Data

In [80]:
x = df['Msg']

In [81]:
y = df['Label']

In [83]:
x_train, x_test, y_train, y_test = train_test_split (x,y,train_size = 0.33)

## 4.2 Vectorization (Convert Text Data Into The Vector)

In [84]:
tf_obj = TfidfVectorizer() 
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [85]:
x_train_tfidf.shape

(1838, 4557)

# 4.3 Pipelining

In [91]:
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [92]:
text_mnb.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [18]:
tf_obj = TfidfVectorizer() 
x = tf_obj.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
y = pd.get_dummies(df['Label'],drop_first = True).to_numpy()
y 

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]], dtype=uint8)

In [33]:
x_train, x_test, y_train, y_test = train_test_split (x,y,train_size = 0.2,stratify = y)

In [34]:
len(x_train),len(y_train)

(1114, 1114)

In [35]:
len(x_test),len(y_test)

(4458, 4458)

In [37]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
y_pred_train = rf.predict(x_train)
print("Accuracy Score", accuracy_score(y_train, y_pred_train))

Accuracy Score 0.9937163375224417


In [39]:
y_pred_test = rf.predict(x_test)
print("Accuracy Score", accuracy_score(y_test, y_pred_test))

Accuracy Score 0.9481830417227456


In [51]:
class Evaluation:
    
    def __init__(self,model,x_train,x_test,y_train,y_test):
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model
        
    def train_evaluation(self):
        y_pred_train = self.model.predict(self.x_train)
        
        acc_scr_train = accuracy_score(self.y_train,y_pred_train )*100
        print("Accuracy Score On Training Data Set:",acc_scr_train)
        print()
        
        con_mat_train = confusion_matrix(self.y_train,y_pred_train )
        print("Confusion Matrix On Training Data Set:\n",con_mat_train)
        print()
        
        class_rep_train = classification_report(self.y_train,y_pred_train )
        print("Classification Report On Training Data Set:\n",class_rep_train)
        print()
        
        
    def test_evaluation(self):
        y_pred_test = self.model.predict(self.x_test)
        
        acc_scr_test = accuracy_score(self.y_test,y_pred_test )*100
        print("Accuracy Score On Testing Data Set:",acc_scr_test)
        print()
        
        con_mat_test = confusion_matrix(self.y_test,y_pred_test )
        print("Confusion Matrix On Testing Data Set:\n",con_mat_test)
        print()
        
        class_rep_test = classification_report(self.y_test,y_pred_test )
        print("Classification Report On Testing Data Set:\n",class_rep_test)
        print()

In [52]:
Evaluation(rf,x_train,x_test,y_train,y_test).train_evaluation()

Accuracy Score On Training Data Set: 99.37163375224416

Confusion Matrix On Training Data Set:
 [[965   0]
 [  7 142]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       965
           1       1.00      0.95      0.98       149

    accuracy                           0.99      1114
   macro avg       1.00      0.98      0.99      1114
weighted avg       0.99      0.99      0.99      1114




In [53]:
Evaluation(rf,x_train,x_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set: 94.81830417227457

Confusion Matrix On Testing Data Set:
 [[3856    4]
 [ 227  371]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      3860
           1       0.99      0.62      0.76       598

    accuracy                           0.95      4458
   macro avg       0.97      0.81      0.87      4458
weighted avg       0.95      0.95      0.94      4458




In [54]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
Evaluation(lr,x_train,x_test,y_train,y_test).train_evaluation()

Accuracy Score On Training Data Set: 91.38240574506284

Confusion Matrix On Training Data Set:
 [[964   1]
 [ 95  54]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       965
           1       0.98      0.36      0.53       149

    accuracy                           0.91      1114
   macro avg       0.95      0.68      0.74      1114
weighted avg       0.92      0.91      0.90      1114




In [56]:
Evaluation(lr,x_train,x_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set: 89.03095558546433

Confusion Matrix On Testing Data Set:
 [[3857    3]
 [ 486  112]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      3860
           1       0.97      0.19      0.31       598

    accuracy                           0.89      4458
   macro avg       0.93      0.59      0.63      4458
weighted avg       0.90      0.89      0.86      4458




In [57]:
from sklearn.tree import DecisionTreeClassifier

In [58]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [59]:
Evaluation(dt,x_train,x_test,y_train,y_test).train_evaluation()

Accuracy Score On Training Data Set: 100.0

Confusion Matrix On Training Data Set:
 [[965   0]
 [  0 149]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       965
           1       1.00      1.00      1.00       149

    accuracy                           1.00      1114
   macro avg       1.00      1.00      1.00      1114
weighted avg       1.00      1.00      1.00      1114




In [60]:
Evaluation(lr,x_train,x_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set: 89.03095558546433

Confusion Matrix On Testing Data Set:
 [[3857    3]
 [ 486  112]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      3860
           1       0.97      0.19      0.31       598

    accuracy                           0.89      4458
   macro avg       0.93      0.59      0.63      4458
weighted avg       0.90      0.89      0.86      4458




In [61]:
from sklearn.naive_bayes import MultinomialNB

In [62]:
nb_obj = MultinomialNB()
nb_obj.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [63]:
Evaluation(nb_obj,x_train,x_test,y_train,y_test).train_evaluation()

Accuracy Score On Training Data Set: 93.98563734290843

Confusion Matrix On Training Data Set:
 [[965   0]
 [ 67  82]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       1.00      0.55      0.71       149

    accuracy                           0.94      1114
   macro avg       0.97      0.78      0.84      1114
weighted avg       0.94      0.94      0.93      1114




In [64]:
Evaluation(nb_obj,x_train,x_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set: 90.30955585464334

Confusion Matrix On Testing Data Set:
 [[3860    0]
 [ 432  166]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95      3860
           1       1.00      0.28      0.43       598

    accuracy                           0.90      4458
   macro avg       0.95      0.64      0.69      4458
weighted avg       0.91      0.90      0.88      4458




In [None]:
Compare Accuracy with different Models

                             Train Accuracy        Testing Accuracy                       
RandomForest Algorithm           99.37                  94.81

Logistic Regression              91.38                  89.03

Decision Tree                    100                    89.03

Naive Bayes (MultinominalNB)     93.98                  90.30