# Naive Bayes 2

### Import libraries

In [1]:
import pandas as pd
import re
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB 

### Set global variables

In [2]:
global test_set_size, random_state
test_set_size = .3
random_state = 42

### Load data

In [3]:
df = pd.read_csv('data/SMS_spam.tsv', sep='\t', names=['label', 'body_text'], header=None)
df.shape

(5568, 2)

### Examine data

In [4]:
pd.set_option('max_colwidth',200)

In [5]:
df.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


#### Bernoulli Naive Bayes classifier requires target variables to be binary

In [6]:
# Check if target variable is binary or not 
df.label.value_counts()

ham     4822
spam     746
Name: label, dtype: int64

#### Since there are only 2 unique target variable values, it is binary 

## Experiment #1:  Hashing vectorizer with body_text (unchanged)

In [7]:
# Separate dependent and independent variables
X = df['body_text']
y = df['label']

In [8]:
# Instantiate hashing vectorizer object 
hv = HashingVectorizer()

In [9]:
# Tranform body_text into numeric values 
X = hv.fit_transform(X)

In [10]:
# Split into training set and test set
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = test_set_size, random_state = random_state)

### Train Bernoulli Naive Bayes classifier (with hashing vectorization)

In [11]:
model = BernoulliNB()
model.fit(X_train, y_train)
predict = model.predict(X_test)

### Evaluate Bernoulli Naive Bayes model performance (with hashing vectorization)

In [12]:
print('Accuracy on training set: {:.4f}'.format(model.score(X_train, y_train)))
print('Accuracy on test set: {:.4f}'.format(model.score(X_test, y_test)))

Accuracy on training set: 0.8619
Accuracy on test set: 0.8755


## Perform text mining preprocessing (lowercase, remove punctuation, tokenize, remove stopwords, and stem) on body_text

In [13]:
# Define stop words list
stopwords = nltk.corpus.stopwords.words('english')     # All English Stopwords

In [14]:
# Instantiate Porter stemmer
ps = nltk.PorterStemmer()

In [15]:
# Function to clean_text
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    text_2 = ' '.join(word for word in text)
    return text_2

In [16]:
df['body_text_clean'] = df['body_text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.,ive search right word thank breather promis wont take help grant fulfil promis wonder bless time
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18
2,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goe usf live around though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday


## Experiment #2:  Hashing vectorizer with body_text (cleaned)

In [17]:
# Separate dependent and independent variables
X = df['body_text_clean']
y = df['label']

In [18]:
# Instantiate hashing vectorizer object 
# hv = HashingVectorizer()     # No need to do this since we already instantiated it earlier

In [19]:
# Tranform body_text into numeric values 
X = hv.fit_transform(X)

In [20]:
# Split into training set and test set
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = test_set_size, random_state = random_state)

### Train Bernoulli Naive Bayes classifier (with hashing vectorization + clean data)

In [21]:
# model = BernoulliNB()     # No need to do this since we already instantiated it earlier
model.fit(X_train, y_train)
predict = model.predict(X_test)

### Evaluate Bernoulli Naive Bayes model performance (with hashing vectorization + clean data)

In [22]:
print('Accuracy on training set: {:.4f}'.format(model.score(X_train, y_train)))
print('Accuracy on test set: {:.4f}'.format(model.score(X_test, y_test)))

Accuracy on training set: 0.8619
Accuracy on test set: 0.8755


#### No difference in performance between experiments 1 & 2

## Experiment #3:  Count vectorizer (Bag of Words) with body_text (cleaned)

In [23]:
# Separate dependent and independent variables
X = df['body_text_clean']
y = df['label']

In [24]:
# Instantiate count vectorizer (Bag of Words) object 
cv = CountVectorizer()

In [25]:
# Tranform body_text into numeric values 
X = cv.fit_transform(X)

In [26]:
# Split into training set and test set
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = test_set_size, random_state = random_state)

### Train Bernoulli Naive Bayes classifier (with Bag of Words vectorization + clean data)

In [27]:
# model = BernoulliNB()     # No need to do this since we already instantiated it earlier
model.fit(X_train, y_train)
predict = model.predict(X_test)

### Evaluate Bernoulli Naive Bayes model performance (with Bag of Words vectorization + clean data)

In [28]:
print('Accuracy on training set: {:.4f}'.format(model.score(X_train, y_train)))
print('Accuracy on test set: {:.4f}'.format(model.score(X_test, y_test)))

Accuracy on training set: 0.9815
Accuracy on test set: 0.9743


#### >10% improvement in performance in experiment 3 over experiments 1 & 2

## Experiment #4:  TF-IDF vectorizer with body_text (cleaned)

In [29]:
# Separate dependent and independent variables
X = df['body_text_clean']
y = df['label']

In [30]:
# Instantiate TF-IDF vectorizer object 
tfidfv = TfidfVectorizer()

In [31]:
# Tranform body_text into numeric values 
X = tfidfv.fit_transform(X)

In [32]:
# Split into training set and test set
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = test_set_size, random_state = random_state)

### Train Bernoulli Naive Bayes classifier (with TF-IDF vectorization + clean data)

In [33]:
# model = BernoulliNB()     # No need to do this since we already instantiated it earlier
model.fit(X_train, y_train)
predict = model.predict(X_test)

### Evaluate Bernoulli Naive Bayes model performance (with TF-IDF vectorization + clean data)

In [34]:
print('Accuracy on training set: {:.4f}'.format(model.score(X_train, y_train)))
print('Accuracy on test set: {:.4f}'.format(model.score(X_test, y_test)))

Accuracy on training set: 0.9815
Accuracy on test set: 0.9743


#### Same >10% improvement in performance in experiment 4 over experiments 1 & 2.  Equivalent performance in experiments 3 & 4 