# Problem Statement :

# SMS Spam Detection using Natural Language Processing with Python

NLP is commonly used in text classification task such as spam detection and sentiment analysis, text generation, language translations and document classification.

# Required Libraries

In [102]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# 1. Data Gathering

In [70]:
df = pd.read_csv("SMSSpamCollection", sep = '\t', names=['Label','Msg'])
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 2. Exploratory Data Analysis (EDA)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Label    5572 non-null object
Msg      5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [100]:
df.shape

(5572, 2)

In [101]:
df.isna().sum()

Label    0
Msg      0
dtype: int64

In [74]:
df['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

# 3. Data Pre-Processing

In [75]:
lm = WordNetLemmatizer()
stopword = stopwords.words('english')
corpus = []
for data in range (len(df)):
    review = re.sub('^a-zA-Z0-9',' ',df['Msg'][data])
    review = review.lower()
    review = review.split()
    review = [x for x in review if x not in stopwords.words('english')]
    review = [lm.lemmatize(x) for x in review]
    review = " ".join(review)
    corpus.append(review)    

In [77]:
len(corpus)

5572

In [78]:
df['Msg']=corpus

In [79]:
df.head()

Unnamed: 0,Label,Msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


# 4. Model Building

## 4.1 Data Splitting into the Train and Test Data

In [80]:
x = df['Msg']

In [81]:
y = df['Label']

In [83]:
x_train, x_test, y_train, y_test = train_test_split (x,y,train_size = 0.33)

## 4.2 Vectorization (Convert Text Data Into The Vector)

In [84]:
tf_obj = TfidfVectorizer() 
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [85]:
x_train_tfidf.shape

(1838, 4557)

# 4.3 Pipelining

In [91]:
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [92]:
text_mnb.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [105]:
y_pred_train = text_mnb.predict(x_train)
print("Accuracy Score", accuracy_score(y_train, y_pred_train)*100)

Accuracy Score 97.55168661588684


In [106]:
y_pred_test = text_mnb.predict(x_test)
print("Accuracy Score", accuracy_score(y_test, y_pred_test)*100)

Accuracy Score 94.64381360471344


In [107]:
con_mat_test = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix On Training Data Set:\n",con_mat_test)

Confusion Matrix On Training Data Set:
 [[3228    0]
 [ 200  306]]


In [108]:
class_rep_test = classification_report(y_test, y_pred_test)
print("Classification Report On Testing Data Set:\n",class_rep_test)

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

         ham       0.94      1.00      0.97      3228
        spam       1.00      0.60      0.75       506

    accuracy                           0.95      3734
   macro avg       0.97      0.80      0.86      3734
weighted avg       0.95      0.95      0.94      3734



# Prediction On User Data

In [150]:
def Preprocess_data(text):
    lm = WordNetLemmatizer()
    review = re.sub('^a-zA-Z0-9',' ',text)
    review = review.lower()
    review = review.split()
    review = [x for x in review if x not in stopwords.words('english')]
    review = [lm.lemmatize(x) for x in review]
    review = " ".join(review)
    return [review]
    

In [152]:
data = '''free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's 
apply 08452810075over18's'''
data = Preprocess_data(data)
data

["free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's"]

In [155]:
text_mnb.predict(data)[0]

'spam'

In [144]:
class Prediction:
    
    def __init__(self,data):
        self.data = data
        
    def user_data_preprocessing(self):
        lm = WordNetLemmatizer()
        review = re.sub('^a-zA-Z0-9',' ',self.data)
        review = review.lower()
        review = review.split()
        review = [x for x in review if x not in stopwords.words('english')]
        review = [lm.lemmatize(x) for x in review]
        review = " ".join(review)
        return [review]
    
    def user_data_prediction(self):
        preprocess_data = self.user_data_preprocessing()
                
        if text_mnb.predict(preprocess_data)[0] =='spam':
            return "The SMS Is Spam"
        else:
            return "The SMS Is Ham"          

In [145]:
data = "Free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's"
Prediction(data).user_data_prediction()

'The SMS Is Spam'

In [146]:
data_1 = df['Msg'][1]
print(data_1)
Prediction(data_1).user_data_prediction()

ok lar... joking wif u oni...


'The SMS Is Ham'