In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
import sys
sys.path.append('../tools')
import tools

# Naive Bayes using Iris Data

In [59]:
# Load the Iris dataset via sklearn's datasets module; its .data and .target
# fields are used by the cells below.
iris = datasets.load_iris()

In [60]:
# Feature matrix used to fit and evaluate the classifier below.
# NOTE: this binds X to the same array object as iris.data (no copy), so any
# in-place change to X would also change iris.data.
X = iris.data

In [58]:
# Preview the first ten rows of the feature matrix.
# NOTE(review): the saved output below shows small integer-like codes rather
# than the raw measurements that df.head(10) displays further down, and this
# cell's execution count is lower than that of the cells defining X --
# presumably a stale output from an earlier kernel state. Re-run the notebook
# top to bottom on a fresh kernel to confirm.
X[:10,:]

array([[0., 3., 0., 0.],
       [0., 1., 0., 0.],
       [0., 2., 0., 0.],
       [0., 2., 0., 0.],
       [0., 3., 0., 0.],
       [1., 3., 1., 1.],
       [0., 3., 0., 0.],
       [0., 3., 0., 0.],
       [0., 1., 0., 0.],
       [0., 2., 0., 0.]])

In [15]:
# Class labels; the same values are stored as the FlowerClass column of the
# DataFrame built below.
y = iris.target

In [7]:
# Preview the first ten class labels.
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
# Gather the four measurement columns and the class label into one DataFrame,
# one named column per feature.
df = pd.DataFrame(dict(
    SepalLength=iris.data[:, 0],
    SepalWidth=iris.data[:, 1],
    PetalLength=iris.data[:, 2],
    PetalWidth=iris.data[:, 3],
    FlowerClass=iris.target,
))

In [9]:
# Show the first ten rows of the assembled DataFrame.
df.head(10)

Unnamed: 0,FlowerClass,PetalLength,PetalWidth,SepalLength,SepalWidth
0,0,1.4,0.2,5.1,3.5
1,0,1.4,0.2,4.9,3.0
2,0,1.3,0.2,4.7,3.2
3,0,1.5,0.2,4.6,3.1
4,0,1.4,0.2,5.0,3.6
5,0,1.7,0.4,5.4,3.9
6,0,1.4,0.3,4.6,3.4
7,0,1.5,0.2,5.0,3.4
8,0,1.4,0.2,4.4,2.9
9,0,1.5,0.1,4.9,3.1


In [31]:
# Exploratory check: split the column at index 1 of X into four quantile bins
# and show the integer bin index (labels=False) assigned to every row.
pd.qcut(X[:,1],4,labels=False)

array([3, 1, 2, 2, 3, 3, 3, 3, 1, 2, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 2, 3, 1, 3, 3, 3, 2, 2, 3, 3, 3, 2, 2, 3, 2, 1, 3, 3, 0, 2, 3,
       3, 1, 3, 2, 3, 2, 2, 2, 2, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 1, 1, 2,
       1, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 3, 2, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 1, 0, 1, 0, 3,
       2, 0, 1, 0, 0, 2, 1, 3, 0, 0, 2, 0, 0, 0, 2, 2, 0, 1, 0, 1, 0, 3,
       0, 0, 0, 1, 3, 2, 1, 2, 2, 2, 0, 2, 2, 1, 0, 1, 3, 1])

In [180]:
class NaiveBayes():
    """
    Frequency-based naive Bayes classifier for discrete features.

    Each class y is scored for an observation x as

        P(Y=y) * prod_i P(X_i = x_i | Y=y)

    where every probability is a relative frequency taken from the training
    data and the product relies on the conditional-independence assumption.
    The class with the highest score is predicted.

    Parameters:
        X: array-like with n rows (observations) and p columns (features)
        y: array-like of n labels (classes)
        binning: set to True when the features are continuous; every column is
            then replaced by the index of its quantile bin (four bins, fewer
            if quantile edges coincide)

    Attributes:
        X: private copy of the training features (bin indices when binning)
        y: private copy of the training labels
        binning: whether quantile binning is applied to inputs
        bins: dictionary column index -> bin edges learned from X (binning only)
        conditionals: dictionary kept for backward compatibility (not populated)
        priors: dictionary label -> prior probability P(Y=label)
        prior_indices: dictionary label -> row indices of X carrying that label
    """
    def __init__(self,X,y,binning=False):
        self.binning = bool(binning)
        # np.array always copies, so the caller's data is never modified.
        # With binning the columns are overwritten with bin indices (NaN for
        # values outside the learned edges), which requires a float array.
        if self.binning:
            self.X = np.array(X, dtype=float)
        else:
            self.X = np.array(X)
        self.y = np.array(y)
        if self.X.ndim != 2:
            raise ValueError("X must be 2-dimensional (observations x features)")
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError("X and y must contain the same number of observations")
        self.bins = {}
        if self.binning:
            self.binvals(self.X)
        self.conditionals = {}
        self.priors = {}
        self.prior_indices = {}
        self.get_priors()

    def binvals(self,values):
        """
        Learn quantile bin edges for every column of `values`, store them in
        self.bins, and overwrite each column IN PLACE with its bin indices.
        """
        for i in range(values.shape[1]):
            # duplicates="drop" keeps qcut from raising when several quantiles
            # fall on the same value (heavily repeated data).
            self.bins[i] = pd.qcut(values[:,i], 4, retbins=True, duplicates="drop")[1]
            values[:,i] = pd.cut(values[:,i], self.bins[i], labels=False, include_lowest=True)

    def get_priors(self):
        """
        Prior probs for each class y, P(Y=y), plus the row indices of each class.
        """
        total = self.y.shape[0]
        for label in np.unique(self.y):
            indices = np.where(self.y == label)[0]
            self.prior_indices[label] = indices
            self.priors[label] = indices.shape[0] / total

    def get_conditionals(self,observation,y_val):
        """
        Naive Bayes score of one observation for class `y_val`:
        P(Y=y_val) * prod_i P(X_i = observation_i | Y=y_val).

        `observation` is a single (already binned, if binning is on) row of
        shape 1xp; a flat length-p vector is accepted as well.
        Returns a float; it is 0 when any feature value was never seen
        together with `y_val` in the training data.
        """
        observation = np.asarray(observation).reshape(1, -1)
        indices = self.prior_indices[y_val]
        # Conditionals are normalised by the class size (not the total sample
        # size) and the prior enters exactly once; otherwise the score would
        # carry the prior to the p-th power and mis-rank imbalanced classes.
        class_size = indices.shape[0]
        prob = self.priors[y_val]
        for i in range(observation.shape[1]):
            matches = np.count_nonzero(self.X[indices, i] == observation[0, i])
            prob = prob * (matches / class_size)
        return prob

    def predict(self,observations):
        """
        Predict a class label for every row of `observations`.

        observations must be either vectors of shape 1xp or matrices where
        rows = observations (a flat length-p vector is treated as one row).
        Returns a 1-D numpy array of labels, or None (after printing an
        error) when the number of columns differs from the training data.
        Ties are resolved in favour of the class that comes first in priors.
        """
        observations = np.asarray(observations)
        if observations.ndim == 1:
            observations = observations.reshape(1, -1)
        # Validate before binning: the bin lookup below is keyed by column.
        if observations.shape[1] != self.X.shape[1]:
            print("Error: observation columns not the same rank as data X")
            return None
        if self.binning:
            # astype copies, so the caller's array is left untouched; float is
            # needed because out-of-range values are binned to NaN.
            observations = observations.astype(float)
            for i in range(observations.shape[1]):
                observations[:,i] = pd.cut(observations[:,i], self.bins[i], labels=False, include_lowest=True)

        labels = list(self.priors.keys())
        prediction = []
        for row in observations:
            scores = [self.get_conditionals(row.reshape(1, -1), label) for label in labels]
            # Map the winning position back to the actual class label.
            prediction.append(labels[int(np.argmax(scores))])
        return np.array(prediction).reshape(len(prediction),)
                
                
                
                
            
        
        
    
        

In [189]:
# Fit the classifier on the full dataset with quantile binning enabled.
nb = NaiveBayes(X, y, binning=True)
# Score the same rows the model was fitted on (no hard-coded 150x4 shape, so
# this keeps working if the dataset changes).
# NOTE: because no rows are held out, the accuracy reported below is a
# training accuracy, not an estimate of performance on unseen data.
obs = X

In [190]:
# Classify every row of obs with the fitted model.
predictions = nb.predict(obs)

In [192]:
# Display the full vector of predicted classes.
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1])

In [201]:
# Report the score computed by the project's accuracy helper from the
# predictions and the labels.
print("Accuracy: %f" % (tools.calc_accuracy(predictions, y),))

Accuracy: 0.900000
