<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: Naive Bayes

## Example 1: Gaussian Naive Bayes Classifier

In [17]:
# Example 1
# ---
# This type of classifier makes the assumption of normal distribution 
# thus can be best used in cases when all our features are continuous.
# ---
# Question: Predict the species of flower using 4 different features.
# ---
# 
#OUR CODE GOES HERE

In [18]:
# Load libraries and datasets to be used in this example
#
from sklearn import datasets
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [19]:
# Loading our data from python datasets
# 
iris = datasets.load_iris()
X = iris.data
y = iris.target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [20]:
# Splitting our data into a training set and a test set
# 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6) 

In [21]:
# Training our model
# 
clf = GaussianNB()  
model = clf.fit(X_train, y_train) 

In [22]:
# Predicting our test predictors
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

0.9333333333333333


In [23]:
# Predicting a new observation
new_observation = [[ 10,  3,  4,  0.4]]

new_prediction = model.predict(new_observation)
new_prediction

array([1])

## Example 2: Multinomial Naive Bayes Classifier

In [24]:
# Example 2
# ---
# While working with the multinomial naive bayes classifier, the features are assumed to be multinomially distributed. 
# This would mean that this type of classifier is commonly used when we have discrete data (e.g. movie ratings 1 and 5).
# Let us see how this works.
# ----
# Question: Build a model to predict whether an sms message is spam or not.
# ---
# Dataset url = http://bit.ly/SpamCollectionDataset
# ---
# 
#OUR CODE GOES HERE

In [25]:
# Importing our libraries 

# Importing pandas
import pandas as pd

# Importing numpy
import numpy as np

# We will also download and import nlkt which is a tokenizer. 
# This library will help us break (messages) into individual linguistic units i.e. words.
#
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ronal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
# Loading and previewing our dataset
# 
df = pd.read_csv('http://bit.ly/SpamCollectionDataset', sep='\t',  header = None, names = ['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Pre-processing
# We will first emoving useless variance for our task at hand 
# 

# Converting the labels from strings to binary values for our classifier
# 
df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Converting all characters in the message to lower case
# 
df['message'] = df.message.map(lambda x: x.lower())

# Removing any punctuation
# 
df['message'] = df.message.str.replace('[^\w\s]', '')

  df['message'] = df.message.str.replace('[^\w\s]', '')


In [28]:
df['message'][0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [48]:
df.shape

(5572, 2)

In [29]:
# Pre-processing 
# Tokenizing the messages into into single words using nltk. 

# Applying the tokenization
# 
df['message'] = df['message'].apply(nltk.word_tokenize)

In [30]:
df['message'][0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [31]:
# Fifth, we will perform some word stemming. 
# The idea of stemming is to normalize our text for all variations of words carry the same meaning, 
# regardless of the tense. One of the most popular stemming algorithms is the Porter Stemmer:
# 
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
 
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])

In [32]:
df['message'][0]

['go',
 'until',
 'jurong',
 'point',
 'crazi',
 'avail',
 'onli',
 'in',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amor',
 'wat']

In [33]:
# Finally, we will transform the data into occurrences, 
# which will be the features that we will feed into our model
# 
from sklearn.feature_extraction.text import CountVectorizer

# This converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['message'])

In [64]:
counts

<5572x8169 sparse matrix of type '<class 'numpy.float64'>'
	with 72500 stored elements in Compressed Sparse Row format>

In [34]:
mess = 'we are going to test how the stemming works in real world in handy handed handle languages, language'


In [35]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
word = word_tokenize(mess)
stemmer = PorterStemmer()
for w in word:

    print(stemmer.stem(w))

we
are
go
to
test
how
the
stem
work
in
real
world
in
handi
hand
handl
languag
,
languag


In [36]:
# We could leave it as the simple word-count per message, but it is better to use Term Frequency Inverse Document Frequency, more known as tf-idf
# 
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer().fit(counts)

counts = transformer.transform(counts)

In [37]:
# Training the Model
# Now that we have performed feature extraction from our data, it is time to build our model. 
# We will start by splitting our data into training and test sets
# 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69)

In [38]:
# Fitting our model 
# Then, all that we have to do is initialize the Naive Bayes Classifier and fit the data. 
# For text classification problems, the Multinomial Naive Bayes Classifier is well-suited
# 
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)

In [39]:
# Evaluating the Model
# Once we have put together our classifier, we can evaluate its performance in the testing set
# 
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

0.9480286738351255


In [67]:
# Enter the word to predict
new_word = 'come we test, and test'
model.predict(new_word )

ValueError: Expected 2D array, got scalar array instead:
array=come we test, and test.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [47]:
df['message']

0       go until jurong point crazi avail onli in bugi...
1                                   ok lar joke wif u oni
2       free entri in 2 a wkli comp to win fa cup fina...
3             u dun say so earli hor u c alreadi then say
4       nah i dont think he goe to usf he live around ...
                              ...                        
5567    thi is the 2nd time we have tri 2 contact u u ...
5568                      will ü b go to esplanad fr home
5569         piti wa in mood for that soani other suggest
5570    the guy did some bitch but i act like id be in...
5571                              rofl it true to it name
Name: message, Length: 5572, dtype: object

## Example 3: Bernoulli Naive Bayes Classifier

In [None]:
# Example 3
# ---
# Question: It is rare to get a scenario where you have to use the Bernoulli Naive Bayes Classifier. 
# However, such a case would assume that all our features are binary, 
# that is they take only two values (e.g. a nominal categorical feature that has been one-hot encoded).
# In the following example we will generate a dataset to demonstrate the use of this Classifier.
# ---
# 
#OUR CODE GOES HERE

In [None]:
# Importing our libraries
# 
import numpy as np
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Creating binary features and target data
# 
# Creating three binary features
X = np.random.randint(2, size=(100, 3))

# Creating a binary target vector
y = np.random.randint(2, size=(100, 1)).ravel()

In [None]:
# Viewing first ten observations
# 
X[0:10]

array([[1, 1, 0],
       [0, 1, 1],
       [1, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [1, 1, 0],
       [1, 0, 1],
       [0, 1, 0]])

In [None]:
# Training our Bernoulli Naive Bayes Classifier
# 
# Creating oour Bernoulli Naive Bayes object with prior probabilities of each class
clf = BernoulliNB()

# Train model
model = clf.fit(X, y)

# model score
model.score(X, y)

0.62

## <font color="green">Challenge 1</font>

In [None]:
# Challenge 1
# ---
# Question: Build a model to determine whether a mushroom is edible.
# ---
# Dataset url = http://bit.ly/MushroomDataset
# 
OUR CODE GOES HERE

## <font color="green">Challenge 2</font> 

In [None]:
# Challenge 2
# ---
# Question: Given the following two datasets, build a model to determine whether a passenger survived or not.
# ---
# Train Dataset url = http://bit.ly/TitanicDatasetTrain
# Test Dataset url = http://bit.ly/TitanicDatasetTest
# ---
# 
OUR CODE GOES HERE

## <font color="green">Challenge 3</font> 

In [None]:
# Challenge 3
# ---
# Question: Build a model to classify a type of glass given the following dataset.
# ---
# Dataset url = http://bit.ly/GlassDatasetB
# Dataset info:
# Type of glass: (class) 
# -) 1 window glass (from vehicle or building) 
# -) 2 not window glass (containers, tableware, or headlamps)
# ---
# 
OUR CODE GOES HERE

## <font color="green">Challenge 4</font> 

In [None]:
# Challenge 4
# ---
# Question: Build a classifier to help determine whether future patients do or do not have heart disease.
# ---
# Dataset url = http://bit.ly/HeartDatasetNB
# 
OUR CODE GOES HERE