# SMS Spam Classifier (NLP)

In [15]:
# import the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
# Fetch the NLTK resources used later in the notebook: the stopword lists and
# the WordNet data that WordNetLemmatizer relies on.
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Razi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Razi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the SMS corpus. The file is tab-separated and has no header row, so the
# two columns are named explicitly: "label" and "messages".
# NOTE(review): quoting is left at the pandas default; if the raw file contains
# unbalanced double quotes, rows may get merged - confirm the row count against the source.
df = pd.read_csv(
    "SMSSpamCollection",
    names=["label", "messages"],
    sep="\t",
)
df.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# define a function that cleans a message: every character that is not a letter (special characters,
# digits, ...) is replaced by a space and the text is converted to lower case
def clean_text(text):
    """Return ``text`` lower-cased with every non-letter character turned into a space.

    Anything outside ``a-z`` / ``A-Z`` (digits, punctuation, whitespace,
    non-ASCII characters) is replaced one-for-one by a single space.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return letters_only.lower()

In [4]:
# Add a "cleaned_messages" column holding clean_text applied to every entry of "messages".
df["cleaned_messages"] = df["messages"].map(clean_text)

In [5]:
# Preview the first 10 rows to compare raw and cleaned text side by side: the cleaned
# column is lower case and free of punctuation (e.g. row 8 has lost its exclamation marks).
df.head(n=10)

Unnamed: 0,label,messages,cleaned_messages
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling it s been week s n...
6,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnamin...
8,spam,WINNER!! As a valued network customer you have...,winner as a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...,had your mobile months or more u r entitle...


In [6]:
# remove stopwords (NLTK's English list) and lemmatize the remaining words
def eliminate_stopwords(messages, stop_words=None, lemmatizer=None):
    """Drop stopwords from every message and lemmatize the words that remain.

    Parameters
    ----------
    messages : iterable of str
        Messages to process (a list, a pandas Series, ...). Values are visited
        in iteration order, so a Series with a non-default index (e.g. after
        filtering or shuffling) is handled correctly.
    stop_words : iterable of str, optional
        Words to remove. Matching is exact and case-sensitive. Defaults to
        NLTK's English stopword list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a fresh ``WordNetLemmatizer``.

    Returns
    -------
    list of str
        One space-joined string per input message, in input order. A message
        made up only of stopwords yields an empty string.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Build the lookup once: re-reading the stopword list for every word and
    # scanning it linearly is what made the original loop slow.
    stop_words = frozenset(stop_words)
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    corpus = []
    # Iterate over the values directly; ``messages[i]`` on a Series is a
    # label lookup and fails when the index is not 0..n-1.
    for message in messages:
        kept_words = [
            lemmatizer.lemmatize(word)
            for word in message.split()
            if word not in stop_words
        ]
        corpus.append(" ".join(kept_words))
    return corpus

In [7]:
# Build the corpus: one stopword-free, lemmatized string per cleaned message.
corpus = eliminate_stopwords(messages=df["cleaned_messages"])

In [8]:
# creating a bag of words vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Keep the document-term matrix sparse: almost every entry is zero, and both
# train_test_split and MultinomialNB accept SciPy sparse input directly, so
# calling .toarray() would only allocate a dense 5572 x 7098 array for nothing.
X = vectorizer.fit_transform(corpus)  # X holds the independent variables (token counts)
X.shape

(5572, 7098)

In [9]:
# Encode the target as integers: 1 where the label is "spam", 0 for everything else.
y = np.where(df["label"].eq("spam"), 1, 0)

In [12]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=7,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4179, 7098), (1393, 7098), (4179,), (1393,))

In [14]:
# Fit a multinomial Naive Bayes classifier on the training split, then predict the test split.
naive_bayes_model = MultinomialNB().fit(X_train, y_train)
y_pred = naive_bayes_model.predict(X_test)

In [19]:
# Evaluate on the test set: per-class precision / recall / F1 first, then overall accuracy.
report = classification_report(y_test, y_pred)
print(report)
# Accuracy is computed explicitly and left as the last expression so the cell
# displays it; previously accuracy_score was imported but never called.
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1201
           1       0.86      0.95      0.90       192

    accuracy                           0.97      1393
   macro avg       0.92      0.96      0.94      1393
weighted avg       0.97      0.97      0.97      1393



0.9712849964106246

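#### The model reaches 97% accuracy on the held-out test set.

Accuracy alone overstates the result because the classes are imbalanced: only 192 of the 1,393 test messages are spam, so always predicting "ham" would already score about 86%. The spam-class figures in the report above (precision 0.86, recall 0.95) are the more informative measure of how well spam is detected.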