## Naive Bayes EDA and model tweaking

This is a follow up to the Udacity base analysis of the SMS Spam dataset.  Looking into methods of increasing the accuracy of the model.

In [1]:
import numpy
import pandas as pd
import pprint
from collections import Counter
from sklearn.naive_bayes import MultinomialNB

pp = pprint.PrettyPrinter(indent = 4)


In [2]:
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('smsspamcollection/SMSSpamCollection',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])

# Output printing out first 5 columns
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Storing column names as variables and converting label(outcome) to numeric.

In [3]:
outcome = 'label'
features = ['sms_message']

df[outcome] = df.label.map({'ham':0, 'spam':1})
print(df.shape)
df.head() # returns (rows, columns)

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[features], 
                                                    df[outcome], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393




### Functions

In [5]:
def condLower(sentence):
    return ' '.join([i.lower() if i != i.upper() else i for i in sentence.split(' ')])

## Getting CountVectorizer from sklearn

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()

print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [7]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer(lowercase=False)  ## adjusting for all caps

# Fit the training data and then return the matrix
lower_case_documents = []
for i in X_train:
    lower_case_documents.append(condLower(i))

training_data = count_vector.fit_transform(lower_case_documents)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
lower_case_documents = []
for i in X_test:
    lower_case_documents.append(condLower(i))
    
testing_data = count_vector.transform(lower_case_documents)

In [10]:
training_data

<1x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [8]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [1, 4179]

In [None]:
predictions = naive_bayes.predict(testing_data)

In [None]:
'''
Solution
'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))