# Get data

In [1]:
import pandas as pd
def get_pima_data(filename='pima-indians-diabetes.csv', label_col=8):
    """Load a header-less CSV file and split it into features and labels.

    The file is read with ``header=None``, so columns are addressed by their
    integer position.

    Args:
        filename: Path of the CSV file to read. Defaults to
            'pima-indians-diabetes.csv' in the current working directory.
        label_col: Position of the column that holds the class label.
            Defaults to 8.

    Returns:
        A tuple ``(X_train, Y_train)``. ``X_train`` is the 2D array of all
        columns except the label column (one row per CSV line) and
        ``Y_train`` is the 1D array of the label column.
    """
    df = pd.read_csv(filename, header=None)
    # pop() removes the label column from the frame and returns it in one
    # step, so what is left in df is exactly the feature matrix.
    Y_train = df.pop(label_col).values
    X_train = df.values
    return X_train, Y_train

# Naive Bayes

In [2]:
from collections import Counter, defaultdict
import numpy as np

class NaiveBaseClass:
    """Helper methods for Naive Bayes classifiers built on top of this class."""

    def calculate_relative_occurences(self, list1):
        """Return the relative frequency of every distinct value in list1.

        Args:
            list1: Sized iterable of hashable values (a list or a 1D array).

        Returns:
            A dict mapping each distinct value to ``count / len(list1)``.
            An empty input yields an empty dict.
        """
        no_examples = float(len(list1))
        return {value: count / no_examples
                for value, count in Counter(list1).items()}

    def get_max_value_key(self, dic):
        """Gets the key for the maximum value in a dict.

        When several keys share the maximum value, the key that comes first
        in the dict's iteration order is returned, so the result is always a
        key of ``dic`` and never ``None``.

        Args:
            dic: Non-empty dict whose values can be compared numerically.

        Returns:
            The key (as a NumPy scalar) whose value is the largest.

        Raises:
            ValueError: If ``dic`` is empty.
        """
        if not dic:
            raise ValueError("cannot take the maximum of an empty dict")
        v = np.array(list(dic.values()))
        k = np.array(list(dic.keys()))
        # argmax returns the index of the FIRST maximum, which gives a
        # deterministic tie-break instead of silently returning None.
        return k[np.argmax(v)]

    def initialize_nb_dict(self):
        """Reset self.nb_dict to one empty defaultdict(list) per label.

        Reads ``self.labels``, which must have been set beforehand.
        """
        self.nb_dict = {label: defaultdict(list) for label in self.labels}

class NaiveBayes(NaiveBaseClass):
    """
    Naive Bayes Classifier for discrete feature values.

    It is trained with a 2D array X of shape (n, m) and a 1D array Y of length n.
    X has one row per training example (n in total) and one column per feature
    (m in total); Y holds the class label of each row.
    After training, self.class_probabilities maps each label to its relative
    frequency in Y, and self.nb_dict[label][feature_index] maps every value
    observed for that feature within that class to its relative frequency.
    """
    def train(self, X, Y):
        """Estimate the class priors and the per-class feature value frequencies.

        Args:
            X: 2D array-like of shape (n, m) with the training examples.
            Y: 1D array-like of length n with the label of each example.

        Raises:
            ValueError: If X is not two-dimensional.
        """
        # Plain (nested) lists are accepted too: the boolean mask and the
        # column slicing below only work on real arrays.
        X = np.asarray(X)
        Y = np.asarray(Y)
        if X.ndim != 2:
            raise ValueError("X must be a 2D array of shape (n_examples, n_features)")

        self.labels = np.unique(Y)
        no_cols = X.shape[1]
        self.initialize_nb_dict()
        self.class_probabilities = self.calculate_relative_occurences(Y)

        # For every class, turn each feature column (restricted to the rows of
        # that class) into a dict of relative feature value occurrences.
        for label in self.labels:
            X_ = X[Y == label]
            for jj in range(no_cols):
                self.nb_dict[label][jj] = self.calculate_relative_occurences(list(X_[:, jj]))

    def classify_single_elem(self, X_elem):
        """Predict the label of a single example.

        For every label the class probability is multiplied by the relative
        frequency of each feature value of X_elem within that class. The label
        with the highest product is selected through get_max_value_key.

        Args:
            X_elem: Sequence with one value per feature, in training column order.

        Returns:
            Whatever get_max_value_key returns for the per-label scores.
        """
        Y_dict = {}
        for label in self.labels:
            class_probability = self.class_probabilities[label]
            feature_tables = self.nb_dict[label]
            for ii, value in enumerate(X_elem):
                # No smoothing is applied: a value that was never observed for
                # this class during training contributes a factor of 0.
                class_probability *= feature_tables[ii].get(value, 0)
            Y_dict[label] = class_probability
        return self.get_max_value_key(Y_dict)

    def classify(self, X):
        """Predict a label for every row of X.

        Args:
            X: 2D array-like with one row per example and one column per feature.

        Returns:
            A list with one prediction per row. The same list is also stored
            in self.predicted_Y_values.

        Raises:
            ValueError: If X is not two-dimensional.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be a 2D array of shape (n_examples, n_features)")
        self.predicted_Y_values = [self.classify_single_elem(row) for row in X]
        return self.predicted_Y_values

# Train and predict

In [4]:
# Fit the classifier on the full dataset and then classify the very same rows,
# so the figure printed below is accuracy on the training data.
X_train, Y_train = get_pima_data()
nbc = NaiveBayes()
nbc.train(X_train, Y_train)
preds = nbc.classify(X_train)

# preds is a list; comparing it with the label array yields an element-wise
# boolean array whose sum is the number of correct predictions.
matches = (preds == Y_train)
accuracy = matches.sum().astype(float) / len(preds)
print('Accuracy: {0}'.format(accuracy))

Accuracy: 0.9791666666666666


# Notes

Bayes' theorem:
$$p(C \mid x_0, \dots, x_n) = \frac{p(C)\, p(x_0, \dots, x_n \mid C)}{p(x_0, \dots, x_n)}$$
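The denominator is independent of C, so it is a constant.
<br>The numerator is the joint probability $p(C, x_0, \dots, x_n)$, which the chain rule expands one feature at a time:
$$p(C, x_0, \dots, x_n) = p(x_0 \mid x_1, \dots, x_n, C)\, p(x_1 \mid x_2, \dots, x_n, C) \cdots p(x_n \mid C)\, p(C)$$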

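The naive approach assumes that $p(x_i \mid x_{i+1}, \dots, x_n, C) = p(x_i \mid C)$, that is, each feature is conditionally independent of every other feature given the class. So the posterior can be written as
$$p(C \mid x_0, \dots, x_n) \propto p(C) \prod_{i=0}^{n} p(x_i \mid C)$$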


Training: 
* Calculate probabilities for each class versus feature at training
* P(Ck) for all classes
* P(x | Ck) for every observed value x of every feature, for every class Ck
* nb_dict[class][feature] = dict(x_val, prob)
* Use it at inference time
