# Homework 1: Classification With Naive Bayes

In [57]:
import numpy as np
import pandas as pd
from scipy.stats import norm


In [49]:
#p1
# Load the Pima Indians diabetes table with pandas. The first row of the file
# is skipped and explicit column names are supplied instead; the original note
# for this cell states that this leaves 767 data points.
pid_df = pd.read_csv(
    "pima-indians-diabetes.csv",
    names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
        "Class",
    ],
    skiprows=[0],
)

#General classifier class.
class Classifier:
    """Minimal base class holding a training set and a test set.

    Subclasses are expected to override ``train`` and ``predict``; the
    versions here do nothing and return ``None``.
    """

    def __init__(self, train_df, test_df):
        """Store the two data sets on the instance, unchanged."""
        self.train_df, self.test_df = train_df, test_df

    def train(self):
        """Fit the model on ``self.train_df`` (no-op in the base class)."""
        return None

    def predict(self):
        """Predict labels for ``self.test_df`` (no-op in the base class)."""
        return None

In [119]:
#p1
#naive Bayes classifier
class NBC(Classifier):
    """Gaussian naive Bayes classifier for a DataFrame with a "Class" column.

    The frame given to the constructor is randomly split into a training part
    and a test part.  ``train`` estimates the class priors and one normal
    distribution per (feature, class) pair, ``predict`` labels every test row
    and ``get_acc`` reports the fraction of test rows labelled correctly.
    ``train`` and ``predict`` return ``self`` so the calls can be chained.

    Every column other than "Class" is used as a feature.
    """

    # Columns in which a value of 0 is read as "not recorded" when the
    # ``missing`` option is switched on.
    # NOTE(review): this default assumes the zero-as-missing attributes are
    # blood pressure, skin thickness, BMI and age -- confirm against the
    # assignment text, or pass ``missing_cols`` explicitly.
    DEFAULT_MISSING_COLS = ("BloodPressure", "SkinThickness", "BMI", "Age")

    # Lower bound for a fitted standard deviation.  A constant feature would
    # otherwise give a zero-width normal whose density is NaN/inf.
    MIN_STD = 1e-9

    def __init__(self, df, missing=False, missing_cols=None):
        """Split ``df`` into train/test parts and remember the options.

        Args:
            df: DataFrame holding the feature columns and a "Class" column.
            missing: When true, zeros in ``missing_cols`` are treated as
                missing values: they are left out when fitting the normals
                and contribute nothing to the score at prediction time.
            missing_cols: Iterable of column names subject to the
                zero-as-missing rule.  ``None`` selects
                ``DEFAULT_MISSING_COLS``.
        """
        super().__init__(*self.split(df))
        self.missing = missing
        if missing_cols is None:
            missing_cols = self.DEFAULT_MISSING_COLS
        self.missing_cols = frozenset(missing_cols)

    def _feature_columns(self):
        """Return the names of all training columns except "Class"."""
        return [col for col in self.train_df.columns if col != "Class"]

    def _zero_is_missing(self, col):
        """Tell whether a 0 in ``col`` must be treated as a missing value."""
        return bool(self.missing) and col in self.missing_cols

    def train(self):
        """Estimate priors and per-class normal likelihoods from the train set.

        Sets ``self.p_prior`` (class label -> relative frequency) and
        ``self.p_likelihood`` (column -> class label -> frozen
        ``scipy.stats.norm``).

        Returns:
            ``self``.

        Raises:
            ValueError: If the training split is empty, or if some
                (column, class) pair has no usable value left after the
                zero-as-missing filter.
        """
        labels = self.train_df["Class"]
        n_train = len(labels)
        if n_train == 0:
            raise ValueError("cannot train on an empty training split")

        # Prior of a class = share of training rows carrying that label.
        counts = labels.value_counts(sort=False)
        self.p_prior = {label: count / n_train for label, count in counts.items()}

        p_likelihood = {}
        for col in self._feature_columns():
            skip_zeros = self._zero_is_missing(col)
            p_likelihood[col] = {}
            for label in self.p_prior:
                values = self.train_df.loc[labels == label, col].to_numpy(dtype=float)
                if skip_zeros:
                    values = values[values != 0]
                if values.size == 0:
                    raise ValueError(
                        "no observed values for column %r in class %r" % (col, label)
                    )
                # Population standard deviation (ddof=0), floored so the
                # fitted normal is never degenerate.
                std = max(values.std(), self.MIN_STD)
                p_likelihood[col][label] = norm(values.mean(), std)
        self.p_likelihood = p_likelihood
        return self

    def predict(self):
        """Label every test row with the class of highest log-posterior.

        The score of a class is the log prior plus the sum of the feature
        log-densities.  ``logpdf`` is used instead of ``log(pdf)`` so that
        values far from the mean do not underflow to ``-inf``.  The result is
        stored in ``self.res`` as a dict mapping test-row index -> label.

        Returns:
            ``self``.
        """
        test = self.test_df
        labels = list(self.p_prior)
        features = self._feature_columns()

        # One column of scores per class, one row per test example.
        scores = np.empty((len(test), len(labels)))
        for j, label in enumerate(labels):
            total = np.full(len(test), np.log(self.p_prior[label]), dtype=float)
            for col in features:
                x = test[col].to_numpy(dtype=float)
                log_density = self.p_likelihood[col][label].logpdf(x)
                if self._zero_is_missing(col):
                    # A missing value carries no evidence for any class.
                    log_density = np.where(x == 0, 0.0, log_density)
                total += log_density
            scores[:, j] = total

        best = scores.argmax(axis=1)
        self.res = {index: labels[k] for index, k in zip(test.index, best)}
        return self

    def get_acc(self):
        """Return the fraction of test rows whose prediction matches "Class".

        Raises:
            ValueError: If the test split is empty, since the accuracy is
                undefined in that case.
        """
        truth = self.test_df["Class"]
        if len(truth) == 0:
            raise ValueError("cannot compute accuracy on an empty test split")
        hits = sum(1 for index, label in truth.items() if self.res[index] == label)
        return hits / len(truth)

    # split the data, 20% for testing, rest for training.
    def split(self, df):
        """Randomly partition ``df`` into ``(train, test)``.

        Each row independently goes to the training part with probability
        0.8, so the 80/20 ratio holds only on average.
        """
        msk = np.random.rand(len(df)) < 0.8
        train = df[msk]
        test = df[~msk]
        return train, test

#p1.a
# Mean test accuracy over 10 independent random train/test splits.
num_splits = 10
print(
    sum(
        NBC(pid_df).train().predict().get_acc()
        for _ in range(num_splits)
    ) / num_splits
)

#p1.b
# Same experiment, with the classifier constructed with missing=True.
num_splits = 10
print(
    sum(
        NBC(pid_df, missing=True).train().predict().get_acc()
        for _ in range(num_splits)
    ) / num_splits
)

0.7483830107762113


In [108]:
#p1.b


Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
Class
