In [2]:
# Load the SMS dataset

import pandas as pd

# Each record in the file holds a label and a message separated by a tab ('\t').
# The column names are supplied explicitly through `names`.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [6]:
# Data cleaning and preprocessing

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

# Build the stop-word set once. It is the same for every message, so
# rebuilding it inside the per-word filter only repeats identical work.
stop_words = set(stopwords.words('english'))

corpus = []

# Iterate over the message values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index.
for message in messages['message']:
    # Replace every character that is not an ASCII letter with a space,
    # lowercase the text and split it into whitespace-separated words.
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # Drop stop words, lemmatize what remains and re-join into one string.
    review = ' '.join(lm.lemmatize(word) for word in words if word not in stop_words)
    corpus.append(review)

In [10]:
# Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

# Fit the vectorizer on the cleaned corpus and transform it in one step,
# then convert the result to a dense array.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

In [12]:
# Creating binary labels from the 'label' column

# Compare against 'spam' directly instead of taking column 1 of
# pd.get_dummies(): this does not depend on the ordering of the dummy columns
# and always yields integer labels (get_dummies returns boolean columns by
# default in pandas >= 2.0).
Y = (messages['label'] == 'spam').astype(int).values  # spam -> 1; anything else (ham) -> 0

X -> Independent Features
Y -> Dependent Variable / Labels

In [13]:
# Train / test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; random_state is fixed at 0.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

In [19]:
# Fit a Multinomial Naive Bayes classifier on the training split

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
# The result of fit() is bound to spam_detect_model, the name later cells use.
spam_detect_model = classifier.fit(X_train, Y_train)

In [21]:
# Predict labels for the held-out test features with the trained classifier

Y_pred = spam_detect_model.predict(X_test)

In [23]:
# Confusion matrix of true vs. predicted test labels

from sklearn.metrics import confusion_matrix

# Rows correspond to the true classes, columns to the predicted classes.
confusion_mat = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(confusion_mat)

[[936  19]
 [  7 153]]


In [24]:
# Accuracy of the predictions on the test set

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(accuracy)

0.9766816143497757
