## Naive Bayes EDA and model tweaking

This is a follow up to the Udacity base analysis of the SMS Spam dataset.  Looking into methods of increasing the accuracy of the model.

In [1]:
import numpy
import pandas as pd
import pprint
from collections import Counter
from sklearn.naive_bayes import MultinomialNB

pp = pprint.PrettyPrinter(indent = 4)

In [2]:
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('smsspamcollection/SMSSpamCollection',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])

# Output printing out first 5 columns
df.head(25)

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


Storing column names as variables and converting label(outcome) to numeric.

In [3]:
outcome = 'label'
features = 'sms_message'

df[outcome] = df.label.map({'ham':0, 'spam':1})
print(df.shape)
df.head() # returns (rows, columns)

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[features], 
                                                    df[outcome], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


### Functions

In [5]:
def condLower(sentence):
    return ' '.join([i.lower() if i != i.upper() else i for i in sentence.split(' ')])

## Getting CountVectorizer from sklearn

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

### Testing base with all to lower

In [7]:
'''
Solution (Base)
'''
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()
print(count_vector)

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [8]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions_base = naive_bayes.predict(testing_data)

'''
Evaluation metrics
'''

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions_base)))
print('Precision score: ', format(precision_score(y_test, predictions_base)))
print('Recall score: ', format(recall_score(y_test, predictions_base)))
print('F1 score: ', format(f1_score(y_test, predictions_base)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


### Testing updated with to lower aside from all caps text

In [9]:
# Fit the training data and then return the matrix
count_vector = CountVectorizer(lowercase=False)  ## adjusting for all caps
print(count_vector)

lower_case_documents = []
for i in X_train:
    lower_case_documents.append(condLower(i))

training_data = count_vector.fit_transform(lower_case_documents)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
lower_case_documents = []
for i in X_test:
    lower_case_documents.append(condLower(i))
    
testing_data = count_vector.transform(lower_case_documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [10]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions_all_caps = naive_bayes.predict(testing_data)

'''
Evaluation metrics
'''

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions_all_caps)))
print('Precision score: ', format(precision_score(y_test, predictions_all_caps)))
print('Recall score: ', format(recall_score(y_test, predictions_all_caps)))
print('F1 score: ', format(f1_score(y_test, predictions_all_caps)))

Accuracy score:  0.9877961234745154
Precision score:  0.9666666666666667
Recall score:  0.9405405405405406
F1 score:  0.9534246575342465


### Testing updated with stopwords

In [11]:
# Fit the training data and then return the matrix
count_vector = CountVectorizer(stop_words='english')  ## adjusting for all caps
print(count_vector)

lower_case_documents = []
for i in X_train:
    lower_case_documents.append(condLower(i))

training_data = count_vector.fit_transform(lower_case_documents)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
lower_case_documents = []
for i in X_test:
    lower_case_documents.append(condLower(i))
    
testing_data = count_vector.transform(lower_case_documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [12]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions_stop_words = naive_bayes.predict(testing_data)

'''
Evaluation metrics
'''

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions_stop_words)))
print('Precision score: ', format(precision_score(y_test, predictions_stop_words)))
print('Recall score: ', format(recall_score(y_test, predictions_stop_words)))
print('F1 score: ', format(f1_score(y_test, predictions_stop_words)))

Accuracy score:  0.9877961234745154
Precision score:  0.9615384615384616
Recall score:  0.9459459459459459
F1 score:  0.9536784741144414


### Testing updated with stopwords and all caps

In [19]:
# Fit the training data and then return the matrix
count_vector = CountVectorizer(stop_words='english', lowercase=False)  ## adjusting for all caps
print(count_vector)

lower_case_documents = []
for i in X_train:
    lower_case_documents.append(condLower(i))

training_data = count_vector.fit_transform(lower_case_documents)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
lower_case_documents = []
for i in X_test:
    lower_case_documents.append(condLower(i))
    
testing_data = count_vector.transform(lower_case_documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [18]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
predictions_stop_words = naive_bayes.predict(testing_data)

'''
Evaluation metrics
'''

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions_stop_words)))
print('Precision score: ', format(precision_score(y_test, predictions_stop_words)))
print('Recall score: ', format(recall_score(y_test, predictions_stop_words)))
print('F1 score: ', format(f1_score(y_test, predictions_stop_words)))

Accuracy score:  0.9899497487437185
Precision score:  0.9723756906077348
Recall score:  0.9513513513513514
F1 score:  0.9617486338797814


Combination of preprocessing of using stop words and adjusting for all caps words makes significant improvements of 94 to 98 basis points.

In [17]:
print('Accuracy score improvement: ', format(100*accuracy_score(y_test, predictions_stop_words)-accuracy_score(y_test, predictions_base)))
print('Precision score improvement: ', format(100*precision_score(y_test, predictions_stop_words)-precision_score(y_test, predictions_base)))
print('Recall score improvement: ', format(100*recall_score(y_test, predictions_stop_words)-recall_score(y_test, predictions_base)))
print('F1 score improvement: ', format(100*f1_score(y_test, predictions_stop_words)-f1_score(y_test, predictions_base)))


Accuracy score improvement:  98.0064608758076
Precision score improvement:  96.26550202166733
Recall score improvement:  94.1945945945946
F1 score improvement:  95.21881943193418
