# Spam Detection using Naive Bayes Algorithm and Bag of Words encoding

In [67]:
import pandas as pd
import numpy as np

In [74]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [77]:
# Convert the given label into numerical format
df["is_spam"] = df.Category.apply(lambda x: 1 if x == "ham" else 0)

In [79]:
df.head()

Unnamed: 0,Category,Message,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


## Prepare the training and test samples

In [85]:
# Split the data into training and testing samples
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.is_spam, test_size = 0.2)
print(X_train.shape)
print(X_test.shape)

(4457,)
(1115,)


In [119]:
# Initialize the count vectorizer to apply the bag-of-words technique
from sklearn.feature_extraction.text import CountVectorizer

cv_obj = CountVectorizer()
X_train_cv = cv_obj.fit_transform(X_train.values)
print(type(X_train_cv))
X_train_np = X_train_cv.toarray()

<class 'scipy.sparse._csr.csr_matrix'>


In [120]:
# Number of sentences which were vectorized with 7740 dimensions which denote all the words in the vocabulary.
X_train_cv.shape

(4457, 7740)

In [141]:
print(X_train_np)
print(X_train[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [144]:
np.where(X_train_np[0] != 0)

(array([2383, 3131, 3173, 3763, 4715, 6930, 7476, 7703], dtype=int64),)

In [145]:
from sklearn.naive_bayes import MultinomialNB

In [146]:
# Model is now trained on the vectors created and the labels created by us
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [148]:
X_test_cv = cv_obj.transform(X_test)

In [149]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95       154
           1       0.99      1.00      0.99       961

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [153]:
emails = [
    "Hey shivam, can we get tofether to watch football game tomorrow?",
    "Upto 20% discount on car purchases. Hurry up now!!"
]

emails_count = cv_obj.transform(emails)
model.predict(emails_count)

array([1, 1], dtype=int64)

## Easier way for computing the above code

In [154]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB())])

In [155]:
clf.fit(X_train, y_train)

In [158]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95       154
           1       0.99      1.00      0.99       961

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

