# Introduction

This notebook contains the code used to build a Text Spam Classifier using a Multinomial Naive Bayes algorithm.

In [None]:
# Importing important libraries
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# Printing files in input folder
print(os.listdir("../input"))

# Loading and Cleaning Data

In this section, I load the data used in the model and clean it.

In [None]:
# Read the raw CSV (decoded as latin-1) into a DataFrame and preview it
csv_path = "../input/spam.csv"
data = pd.read_csv(csv_path, encoding="latin-1")
data.head()

In [None]:
# Keep only the label/text columns and give them descriptive names.
# Renaming first and selecting second makes this cell safe to re-run: rename()
# ignores keys that are no longer present, whereas selecting ['v1', 'v2'] a
# second time would raise a KeyError. The .copy() yields an independent frame,
# so later column assignments on `data` do not trigger SettingWithCopyWarning.
data = data.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']].copy()

In [None]:
# Preview the first rows of the data to check the selected, renamed columns
data.head()

In [None]:
# Converting ham and spam to 0 and 1 respectively.
# Values that are already encoded pass through unchanged, so re-running this
# cell does not turn every label into NaN (which a plain dict .map() would do).
label_ids = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(lambda label: label_ids.get(label, label))

# Fail loudly on anything that is not a known label instead of silently
# carrying unexpected values into training.
unexpected_labels = set(data['label'].unique()) - set(label_ids.values())
if unexpected_labels:
    raise ValueError("Unexpected label values: {}".format(unexpected_labels))

In [None]:
# Count how many rows carry each label value (class balance check)
data['label'].value_counts()

### Observations:
- There are clearly a lot more ham emails than spam emails
- In the model, I will train on an equal number of spam and ham emails in order to reduce bias toward the majority class

In [None]:
# Build a class-balanced dataset: keep every spam row, draw an equally sized
# random sample of ham rows, then shuffle the combined rows and renumber them
ham = data.loc[data['label'] == 0]
spam = data.loc[data['label'] == 1]
new_ham = ham.sample(n=len(spam), random_state=5)
new_data = pd.concat([new_ham, spam])
data = shuffle(new_data, random_state=5).reset_index(drop=True)

# Important Functions

In this section, I define some important functions that will be used in the ML model

In [None]:
# Defining a text parsing function which will tokenize the text. It removes all
# punctuation, whitespace, and stopwords.

# Every ASCII punctuation character (backslash included), escaped so that each
# one is matched literally inside the character class.
_PUNCTUATION_RE = re.compile("[" + re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~""") + "]")

# Filled on first use: asking NLTK for the word list once per token is very
# slow, and a set gives constant-time membership tests.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def textParser(text):
    """Tokenize a text into lower-cased, non-stopword tokens.

    Parameters
    ----------
    text : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        Tokens in their original order, with punctuation removed, split on any
        run of whitespace (spaces, tabs, newlines), lower-cased, and with
        English stopwords dropped. An empty text yields an empty list.
    """
    stop_words = _english_stopwords()
    # split() with no argument splits on any whitespace and never yields
    # empty strings, so no separate empty-token filter is needed.
    lowered = (token.lower() for token in _PUNCTUATION_RE.sub("", text).split())
    return [token for token in lowered if token not in stop_words]

# Building the ML Pipeline

In this section, I build the separate parts of the ML pipeline

## Count Vectorizer
In this section, I use CountVectorizer to tokenize the text (according to the text parser above) and then convert each text into a vectorized format by words and their counts

In [None]:
# Tokenize every text with textParser and encode it as a vector of word counts
bow_vectorizer = CountVectorizer(analyzer=textParser)
bow_data = bow_vectorizer.fit_transform(data['text'])

# Tfidf Transformation
Since longer texts tend to have more words, I normalize for this by using Tfidf transformer on each text

In [None]:
# Re-weight the raw word-count vectors with a TF-IDF transformation
tfidf_transformer = TfidfTransformer()
tfidf_data = tfidf_transformer.fit_transform(bow_data)

# Splitting Data
In this section, I will split data into test and training sets

In [None]:
# Hold out 30% of the TF-IDF vectors (and their labels) as a test set.
# NOTE(review): the count vectorizer and TF-IDF weights were fitted on the full
# dataset before this split, so test-set vocabulary statistics leak into the
# features; the Pipeline section at the end of the notebook fits on the
# training texts only.
X_train, X_test, Y_train, Y_test = train_test_split(
    tfidf_data,
    data[['label']],
    test_size=0.3,
    random_state=5,
)

# Defining a Multinomial Model
In this section, I define a Multinomial Model to classify the texts

In [None]:
# Defining a Multinomial Naive Bayes model
model = MultinomialNB()

# Fitting the Model

In [None]:
# Fitting the model to the training data.
# The TF-IDF matrix is passed as-is: MultinomialNB accepts sparse input, so
# densifying it with .toarray() only costs memory. The one-column label frame
# is flattened to the 1-D array that fit() expects.
fitted_model = model.fit(X_train, np.asarray(Y_train).ravel())

# Predicting and Evaluating the Model

In [None]:
# Predicting on the test data and computing the accuracy.
# accuracy_score is imported from sklearn.metrics in the imports cell. The test
# matrix is passed without .toarray() because the model accepts sparse input.
pred = fitted_model.predict(X_test)
acc_MNB = accuracy_score(np.asarray(Y_test).ravel(), pred)
acc_MNB

In [None]:
# Build the per-class classification report for the test predictions and print it
report = classification_report(np.array(Y_test).ravel(), pred)
print(report)

# Pipeline

In this section, I create a pipeline that mimics the above models

In [None]:
# Chain the three stages used above (word counts -> TF-IDF -> Multinomial NB)
# into a single estimator
pipeline_steps = [
    ('bow', CountVectorizer(analyzer=textParser)),
    ('tfdif', TfidfTransformer()),
    ('model', MultinomialNB()),
]
training_pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Split the raw (unvectorized) texts and their labels into train and test
# sets; this rebinds X_train / X_test / Y_train / Y_test from the earlier split
X_train, X_test, Y_train, Y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=5,
)

In [None]:
# Train the pipeline on the training texts, predict the test texts, and report
# the test accuracy as a percentage
training_pipe.fit(X_train, Y_train)
pred_test_MNB = training_pipe.predict(X_test)
test_accuracy = training_pipe.score(X_test, Y_test)
print("Accuracy (%):", test_accuracy * 100)