# Assignment 4 Naive Bayes Classifier

For this assignment you will implement a Naive Bayes Classifier that implements the SKlearn classifier API with `fit`, `predict` and `score` methods.

The Naive Bayes Classifer takes as parameter the density function used in the likelihood calcuation: 
* `normal`: Normal density function
* `knn`: K nearest neighbor density function

Most of the code already has been written for you. You only need to fill in the missing part between 
```
## Insert your code BEGIN

## Insert your code END
```

In [1]:
from functools import partial

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from scipy.stats import norm

In [130]:
class NaiveBayesClassifier:
    def __init__(self, likelihood='normal', k=None):
        self.likelihood = likelihood
        
        # Let
        #  K = number of unique classes
        #  N = number of test instances
        #  d = number of inputs (input dimensionality)

        # Numpy array unique classes, shape = (K,)
        self.classes = None
        
        # Numpy array of class priors, P(C), shape = (K,)
        self.priors = None
       
        # Numpy array of likelihoods, P(x|C), shape = (N, K),
        self.likelihoods = None

        # Numpy array of posterior probabilities, P(C|x), shape = (N, K)
        self.posteriors = None
        
        ## For the Guassian Density 
        # means, shape = (K, d)
        self.avgs = None
        # variances, shape = (K, d)
        self.vars = None
        
        ## For the knn Density
        # number of neighbors to use
        self.k = k
        # store training X
        self.X_train = None
        # store trainging y
        self.y_train = None

    
    def generate_classes(self, y):
        """
        Generate the classes based on y, and store in self.classes

        :param y: array of class targets
        """
        self.classes = np.unique(y)
        # print(self.classes)
        
    def generate_priors(self, y):
        """
        Compute the prior probabilities and store self.priors

        :param y: array of class targets
 
        """
        ## Insert your code BEGIN
        unique_classes, class_counts = np.unique(y, return_counts=True)
        # print(class_counts)
        self.priors = class_counts/len(y)
        
        ## Insert your code END
    

    def knn_density_function(self, x_train, x_predict): 
        """
        Implements k-nearest neighbor density estimate (Alpaydin Eq 8.8)

        :param x_train 1d numpy array
        :param x_predict 1d numpy array
        :returns probabilities at x_prdict, shape = x_predict.shape
        """
        # Find the distance to kth nearest neighbor
        result = []
        for x0 in x_predict:
            dist = np.abs(x_train - x0)
            index = np.argsort(dist)
            result.append(dist[index==self.k - 1][0])
        dist_k = np.array(result)
        
        # Find the probability at x using knn density
        # Note: Equation 8.8 may return probabilites greater than 1.
        #       For probabilities greater than 1, set it equal to 1.
        ## Insert your code BEGIN
        probablity = []
        for x in dist_k:
            if x == 0:
                probablity.append(1)
            else:
                prob = self.k / (2* len(x_train)* x)
                probablity.append(min(prob, 1))

        return probablity
        # Return ...
        ## Insert your code END
    
    # Gaussian part
    def generate_avgs(self, X, y):
        """
        Return mean for each class and for each attribute
        """
        ## Insert your code BEGIN
        avgs = []
        for clas in self.classes:
            avgs.append(X[y==clas].mean(axis=0))

        
        avgs = np.array(avgs)
        # avgs = np.transpose(avgs)
        # print("avgs",avgs)
        return avgs
      ## Insert your code END
    
    def generate_vars(self, X, y):
        """
        Return variance for each class and for each attribute
        """
        ## Insert your code BEGIN
        vars = []
        for clas in self.classes:
            vars.append(X[y==clas].var(axis=0))

        vars = np.array(vars)
        # vars = np.transpose(vars)
        # print("vars", np.array(vars))
        return vars
        ## Insert your code END
        
    # ## Insert your code BEGIN
    # # Place any method you need here
    # # def ...

    # ## Insert your code END

    def generate_guassian_likelihoods(self, X):
        ## Insert your code BEGIN
        likelihoods = np.zeros((X.shape[0], len(self.classes)))
        # print(self.avgs.shape, self.vars.shape)
        for i, c in enumerate(self.classes):
            # print(i,c)
            likelihoods[:, i] = np.prod(norm.pdf(X, loc=self.avgs[i], scale=np.sqrt(self.vars[i])), axis=1)              
            # generate the likelihood using norm.pdf
        return likelihoods
        ## Insert your code END

    def generate_knn_likelihoods(self, X):
        
        likelihoods = np.ones([len(self.classes), X.shape[0] ])
        for i, aclass in enumerate(self.classes):
            index = self.y_train == aclass
            for attr in range(X.shape[1]):
                ## Insert your code BEGIN
                x_train = self.X_train[index, attr]
                x_predict = X[:, attr]
                likelihoods[i, :] *= self.knn_density_function(x_train, x_predict)
                ## Insert your code END
        likelihood = (np.transpose(likelihoods))
        return likelihood
    
    def fit(self, X, y):
        # define the classes with ascending order
        self.generate_classes(y)
        # compute the Priori probability
        self.generate_priors(y)
        
        # different likelihood function
        if self.likelihood == 'normal':
            # calculate the avg and var based on X and y
            self.avgs = self.generate_avgs(X, y)
            self.vars = self.generate_vars(X, y)
        elif self.likelihood == 'knn':
            self.X_train = X
            self.y_train = y
        else:
            raise ValueError('Invalid value for likelihood. Must be "normal" or "knn".')
        return self

    def generate_likelihoods(self, X):
        """
        :param ndarray x 
        :returns probabilities at X (like X.shape[0] * Number of classes -> {Poss for each class} )
        """
        # Gussian
        if self.likelihood == "normal":
            self.likelihoods = self.generate_guassian_likelihoods(X)
        elif self.likelihood == "knn":
            self.likelihoods = self.generate_knn_likelihoods(X)
        else:
            raise ValueError('Invalid value for likelihood Must be "normal" or "knn".')
        return self.likelihoods

    def predict(self, X):
        """
        :param ndarray x 
        :returns prediction
        """
        # prediction = np.zeros((X.shape[0], len(self.classes)))
        self.likelihoods = self.generate_likelihoods(X)
        ## Insert your code BEGIN
        # print(self.likelihoods.shape, self.priors.shape)
        self.posteriors = self.likelihoods * self.priors
        # print(self.posteriors.shape)
        self.posteriors /= self.posteriors.sum(axis=1, keepdims=True)
        prediction =  self.classes[np.argmax(self.posteriors, axis=1)]
        ## Insert your code END
        # print("prediction", prediction)
        return prediction

    def score(self, X, y, sample_weight=None):
        return accuracy_score(self.predict(X), y, sample_weight=sample_weight)

In [131]:
iris = load_iris()
x = iris['data']
y = iris['target']

In [132]:
print(x.shape)
clf = NaiveBayesClassifier(likelihood='normal')
# clf.generate_classes(y)
# clf.generate_priors(y)
# clf.generate_avgs( x, y)
# clf.generate_vars(x, y)
clf.fit(x, y)
clf.generate_guassian_likelihoods(x)
accuracy = clf.score(x, y)
print(accuracy)
print(len(clf.avgs[0]))

(150, 4)
0.96
4


In [133]:
# Create an instance of the classifier with a normal likelihood distribution
clf = NaiveBayesClassifier(likelihood='normal')

# # Fit the classifier to the training data
clf.fit(x, y)

# # Use the classifier to make predictions on new data
y_pred = clf.predict(x)

# # Evaluate the accuracy of the classifier
accuracy = clf.score(x, y)
print('Accuracy:', accuracy)

Accuracy: 0.96


In [134]:
np.set_printoptions(precision=3)

print("\nmeans:\n", clf.avgs)

print("\nvariances:\n", clf.vars)

print('\nprior probability:\n', clf.priors)

print('\nlikelihoods:')
print(clf.likelihoods[:5, :])
print(clf.likelihoods[50:55, :])
print(clf.likelihoods[100:105, :])

print('\nposteriors:')
print(clf.posteriors[:5, :])
print(clf.posteriors[50:55, :])
print(clf.posteriors[100:105, :])

print('\npredictions:')
print(y_pred[:5])
print(y_pred[50:55])
print(y_pred[100:105])


means:
 [[5.006 3.428 1.462 0.246]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]

variances:
 [[0.122 0.141 0.03  0.011]
 [0.261 0.097 0.216 0.038]
 [0.396 0.102 0.298 0.074]]

prior probability:
 [0.333 0.333 0.333]

likelihoods:
[[8.682e+00 1.179e-17 6.175e-25]
 [4.569e+00 6.922e-17 1.073e-24]
 [3.554e+00 3.813e-18 8.316e-26]
 [3.312e+00 4.857e-17 9.788e-25]
 [8.255e+00 3.742e-18 2.381e-25]]
[[1.830e-110 4.579e-002 1.116e-002]
 [2.020e-102 2.625e-001 1.523e-002]
 [1.804e-124 4.397e-002 5.242e-002]
 [1.401e-071 3.287e-001 1.027e-005]
 [5.307e-108 4.892e-001 2.443e-002]]
[[2.292e-255 4.506e-012 7.093e-002]
 [1.876e-153 4.389e-003 1.711e-001]
 [4.443e-221 8.025e-008 4.779e-001]
 [4.306e-177 1.048e-003 5.246e-001]
 [2.174e-218 1.419e-007 6.156e-001]]

posteriors:
[[1.000e+00 1.358e-18 7.113e-26]
 [1.000e+00 1.515e-17 2.348e-25]
 [1.000e+00 1.073e-18 2.340e-26]
 [1.000e+00 1.466e-17 2.955e-25]
 [1.000e+00 4.533e-19 2.884e-26]]
[[3.214e-109 8.040e-001 1.960e-001]
 [7.273e-102 9.4

In [135]:
# Create an instance of the classifier with a normal likelihood distribution
clf = NaiveBayesClassifier(likelihood='knn', k=3)

# # Fit the classifier to the training data
clf.fit(x, y)

# # Use the classifier to make predictions on new data
y_pred = clf.predict(x)

# # Evaluate the accuracy of the classifier
accuracy = clf.score(x, y)
print('Accuracy:', accuracy)

Accuracy: 0.78


In [136]:
np.set_printoptions(precision=3)

print('prior probability:\n', clf.priors)

print('\nlikelihoods:')
print(clf.likelihoods[:5, :])
print(clf.likelihoods[50:55, :])
print(clf.likelihoods[100:105, :])

print('\nposteriors:')
print(clf.posteriors[:5, :])
print(clf.posteriors[50:55, :])
print(clf.posteriors[100:105, :])

print('\npredictions:')
print(y_pred[:5])
print(y_pred[50:55])
print(y_pred[100:105])

prior probability:
 [0.333 0.333 0.333]

likelihoods:
[[6.750e-03 1.252e-05 1.206e-07]
 [3.375e-03 4.383e-05 2.138e-07]
 [1.012e-03 1.587e-06 2.649e-07]
 [5.625e-05 5.455e-06 1.476e-07]
 [1.125e-03 3.287e-06 1.764e-07]]
[[6.328e-07 4.219e-05 1.808e-05]
 [8.766e-07 4.500e-03 1.558e-05]
 [3.800e-07 1.350e-02 2.755e-05]
 [3.429e-07 1.125e-03 3.600e-06]
 [2.639e-07 1.929e-03 1.607e-05]]
[[3.297e-07 5.455e-05 1.620e-03]
 [1.442e-07 4.821e-05 3.375e-03]
 [1.289e-07 7.714e-05 5.625e-04]
 [2.171e-07 1.157e-04 1.157e-03]
 [1.744e-07 7.143e-05 7.418e-06]]

posteriors:
[[9.981e-01 1.852e-03 1.783e-05]
 [9.871e-01 1.282e-02 6.254e-05]
 [9.982e-01 1.565e-03 2.612e-04]
 [9.094e-01 8.819e-02 2.386e-03]
 [9.969e-01 2.913e-03 1.563e-04]]
[[1.039e-02 6.927e-01 2.969e-01]
 [1.941e-04 9.964e-01 3.449e-03]
 [2.809e-05 9.979e-01 2.037e-03]
 [3.037e-04 9.965e-01 3.189e-03]
 [1.357e-04 9.916e-01 8.263e-03]]
[[1.968e-04 3.257e-02 9.672e-01]
 [4.213e-05 1.408e-02 9.859e-01]
 [2.015e-04 1.206e-01 8.792e-01]
 [1.