In [1]:
# Importing Necessary Libraries
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read CSV file
data = pd.read_csv('spam.csv', encoding='latin1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.shape

(5572, 5)

In [5]:
# Check for Null values
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [7]:
# Transforming data into numerical form
label_encoder = LabelEncoder()
data["label"] = label_encoder.fit_transform(data['v1'])

In [8]:
data.rename(columns={'label':'Label','v2':'Text'}, inplace=True)

In [9]:
data.head(10)

Unnamed: 0,v1,Text,Label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [10]:
data['Label'].value_counts()

0    4825
1     747
Name: Label, dtype: int64

In [11]:
# Calling Porter Stemmer Function.
stemmer = PorterStemmer()

# Calling and storing the set of stopwords in the variable 'stopwords_set'.
stopwords_set = set(stopwords.words('english'))

# Function for preprocessing the given text.
def text_cleaner(text):
    text = text.translate(str.maketrans("", "", string.punctuation)).split() # Removes all the punctuations.
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    text = ' '.join(text)
    
    return text

In [12]:
corpus = []

# Preprocessing the text and storing it in the corpus.
for i in range(len(data)):
    text = data.Text.iloc[i].lower()
    clean_text = text_cleaner(text)
    corpus.append(clean_text)

In [13]:
data.Text.iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [14]:
corpus[2]

'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18'

In [15]:
# Performs TF-IDF Vectorization.
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(corpus).toarray()
y = data.Label

# Splitting the data.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
y_train.shape

(4457,)

In [17]:
# Calling and fitting the model .
model = GaussianNB()
model.fit(x_train, y_train)

In [18]:
# Model Accuracy.
model.score(x_test, y_test)

0.8708520179372198

In [19]:
sms = data.Text.values[8]
sms

'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

In [20]:
# Performs preprocessing on a sample data.
clean_sms = text_cleaner(sms)
sms_corpus = [clean_sms]
my_sms = vectorizer.transform(sms_corpus).toarray()

In [21]:
mydict = {0:'Not Spam', 1:'Spam'}
data.Label.iloc[8]

1

In [22]:
# Making Prediction.
print(f'The Model predicted the SMS as {mydict[int(model.predict(my_sms))]}.')

The Model predicted the SMS as Spam.
