# Build a spam/ham classifier using Naive Bayes

In [1]:
#Library Setup
import os
import re
import string
import datetime
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.feature_extraction.text as text

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import average_precision_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from textblob import Word
from textblob import TextBlob 

%matplotlib inline

In [2]:
#Read in spam dataset containing classification and messages
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])
print(df)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


# Data Inspection

In [3]:
df.columns
df.head()
df.describe

<bound method NDFrame.describe of      label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

# Text Pre-processing (Standardize, Stemming and Lemma)

In [4]:
#Initialize stopwords and PorterStemmer object
stop = stopwords.words('english')
porterStemmer = PorterStemmer()

#Convert messages to lower case, reduce words to stem and lemmatize
df['message'] = df['message'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['message'] = df['message'].apply(lambda x: " ".join([porterStemmer.stem(word) for word in x.split()]))
df['message'] = df['message'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df.head()

Unnamed: 0,label,message
0,ham,"go until jurong point, crazy.. avail onli in b..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri in 2 a wkli comp to win fa cup fina...
3,ham,u dun say so earli hor... u c alreadi then say...
4,ham,"nah i don't think he goe to usf, he live aroun..."


# Split model data into test/training datasets

In [5]:
#Generate train and test datasets
x_train, x_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.25)

In [7]:
x_train.head()
x_test.head()
y_train.head()
x_test.head()


4327                       that seem unnecessarili hostil
3844    ye ammae....lif take lot of turn you can onli ...
3126    1st wk free! gr8 tone str8 2 u each wk. txt no...
848     i am in office:)what the matter..msg me now.i ...
3601                            i know you mood off today
Name: message, dtype: object

# TFIDF - Term Frequency IDF

In [8]:
#Initialize encoder object
labelEncoder = preprocessing.LabelEncoder()

#fit train datasets to encoder object
y_train = labelEncoder.fit_transform(y_train)
y_test = labelEncoder.fit_transform(y_test)

In [9]:
#Evalute that y_train and y_test have encoded
y_train
y_test

array([0, 0, 1, ..., 0, 1, 0])

In [10]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vectorizer.fit(df['message'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [11]:
x_train_tfidf = tfidf_vectorizer.transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [12]:
x_train_tfidf.data

array([0.40305838, 0.67302949, 0.27099518, ..., 0.15091332, 0.36781748,
       0.08109435])

In [15]:
#Vectorize words for fitting and fit transform X_train data 
count_vector = CountVectorizer()
x_train_counts = count_vector.fit_transform(x_train)

# Model Training

In [16]:
#define model training function using supervised learning methods based on applying Bayes' theorem with strong (naive) feature independence assumptions.
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False):
    #fit training dataset to classifier
    classifier.fit(feature_vector_train, label)
    #predict labels on the test dataset
    predictions = classifier.predict(feature_vector_test)
    
    return metrics.accuracy_score(predictions, y_test)

# Model Training - Naive Bayes

In [17]:
NBaccuracy = train_model(naive_bayes.MultinomialNB(alpha=0.2), x_train_tfidf, y_train, x_test_tfidf)
print("Naive Bayes Accuracy: ", NBaccuracy)

Naive Bayes Accuracy:  0.9849246231155779
