In [1]:
import numpy as np

In [143]:
class nb:
    def fit_nb(self, x, y):
        #### note x is a 2d array hence first dimension is no_of tuples and other dimension is no_of features
        #### class variable _classes hold uniques class labels of dataset passed
        ##### no_of_tuples holds no_of_rows passed in dataset
        no_of_tuples,no_of_features = x.shape
        self._classes = np.unique(y)
        no_of_classes = len(self._classes)
        
        ### a 2d array class variable _mean will hold mean values for each feature for respective class in given format 
        #### class mean_of_feature_x1 mean_of_fetaure_x2 ....   ....  .....
        #### 0      m1                m2                 ....   ....  .....
        #### _priors is a vector to hold prior probablity of each class(calculated as no_of_tuples_of_resp_class/total_tuples)
        self._mean = np.zeros((no_of_classes,no_of_features),dtype = np.float64)
        self._var = np.zeros((no_of_classes,no_of_features),dtype = np.float64)
        self._priors = np.zeros(no_of_classes,dtype = np.float64)
        
        ##### evaluating mean and variance and prior probs feature values of all features for each unique class
        for c in self._classes:
            ###x_c represents those samples which have class label as c
            x_c = x[y==c]
            self._mean[c,:] = x_c.mean(axis = 0)
            self._var[c,:] = x_c.var(axis = 0)
            self._priors[c] = x_c.shape[0] / float(no_of_tuples) 
    
    #### a function to evaluate classes for the passed 2d testing array
    def pred_nb(self, x_test):
        y_pred = [self.pred_for_one_sample(one_sample) for one_sample in x_test] ################################
        return y_pred
    
    ######NOTE: since our only aim is to evaluate class label hence we will evaluate class by using direct proportionality 
    ##i.e neglect the denominator while evaluating posterior since it is constant for all and we will be evaluating 
    ### log(numerator)  numerator  =  p(x1/y)*p(x2/y)*...*p(xk/y)*p(y) (neglecting denomi as it is constant) hence 
    ### log(nume) = log(p(x1/y)) + p(x2/y) + p(x3/y) + .... + log(p(y))
    
    ##### a helper module to evaluate class label for each tuple 
    def pred_for_one_sample(self, sample_vector):
        posteriors = []
        #### assigning 0,1,2,3,.... for class c1,c2,c3,c4.....
        for id_x,c in enumerate(self._classes):
            #### calculate p(y) or say prior of particular class
            prior = np.log(self._priors[id_x])
            sum_of_conditional_prob_for_each_feature = np.sum(np.log(self._pdf(id_x,sample_vector)))
            posterior = prior + sum_of_conditional_prob_for_each_feature
            posteriors.append(posterior)
            
        #### return class label of that class which has maximum p(y/x) or maximum posterior value
        return self._classes[np.argmax(posteriors)]   ################################
                
    ### probablity density func
    def _pdf(self, class_idx, vect):
        #### mean vector(vector contains mean of all features) for that particular class 
        mean1 = self._mean[class_idx]
        var1 = self._var[class_idx]
        #diff = np.subtract(vect,mean1)
        numerator = np.exp(-1 * np.square(vect-mean1) / (2*var1))
        #numerator = np.exp(np.square(diff) / (2*var1))
        denominator = np.sqrt(2* np.pi *var1)
        #print(vect)
        return numerator/denominator
        

In [144]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [149]:
#### self writtten  functionn for accuracy
def acc(y_ac,y_pr):
    temp = np.sum(y_ac==y_pr)
    return temp/len(y_ac)

In [150]:
data  = pd.read_csv('diabetes.csv', header=None)
data.head()

y1 = data.iloc[:,-1]
x1 = data.iloc[:,0:8]

x_train1,x_test1,y_train1,y_test1 = train_test_split(x,y,test_size=0.2,random_state=123)



In [147]:
naiveb = nb()
naiveb.fit_nb(x_train1,y_train1)
y_pr1 = naiveb.pred_nb(x_test1)

In [148]:
print(acc(y_test1,y_pr1))

0.965
