# Homework 1: Classification With Naive Bayes

In [366]:
import numpy as np
import pandas as pd
from scipy.stats import norm, bernoulli
from cv2 import resize


In [262]:
#p1
#Read the data using pandas. Skip first row. There are a total of 767 data-points.
pid_df = pd.read_csv("pima-indians-diabetes.csv", skiprows = [0], names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Class"])

#General classifier class.
class Classifier(object):
    def __init__(self, train_df, test_df):
        self.train_df = train_df
        self.test_df = test_df
    def train(self):
        pass
    def predict(self):
        pass 

array([0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,

In [408]:
#p1
#naive Bayes classifier
class NBC(Classifier):
    def __init__(self, df=None, missing = [], train_df = None, test_df = None, distribution="Normal"):
        self.distribution = distribution
        if df is not None:
            super().__init__(*self.split(df))
        else:
            super().__init__(train_df, test_df)
        self.missing = missing
        
    def train(self):
        #calculate the prior probabilities.
        p_prior = {}
        for index, example in self.train_df.iterrows():
            p_prior[example["Class"]] = p_prior.get(example["Class"], 0) + 1
        assert sum(p_prior.values()) == len(self.train_df), "failed to compute the prior"
        for key in p_prior:
            p_prior[key] = p_prior[key]/len(self.train_df)
        self.p_prior = p_prior
        #calculate the likelihood normal distributions mean and std.
        p_likelihood = {}
        for col in list(self.train_df)[:-1]:
            p_likelihood[col] = {}
            for class_value in p_prior:
                values = self.train_df.loc[self.train_df["Class"] == class_value, col].values
                if self.distribution == "Normal":
                    #1.b Ignore 0 values to calculate the mean and std.
                    if col in self.missing:     
                        values = values[values != 0]
                    mean, std = np.mean(values), np.std(values)
                    p_likelihood[col][class_value] = np.array([mean, std]);
                    #Bernoulli
                elif self.distribution == "Bernoulli":
                    p_likelihood[col][class_value] = np.sum(values)/len(values)
        self.p_likelihood = p_likelihood
        return self
    
    # Trying to avoid nested for loop to speed up the computation. Speed up more than 100 times!!!
    def predict(self):
        log_p_posterior = {class_value:np.log(np.array([self.p_prior[class_value] for _, _ in self.test_df.iterrows()])) for class_value in self.p_prior}
        for col in list(self.test_df)[:-1]:
            for class_value in self.p_prior:
                values = self.test_df[col].values
                if self.distribution == "Normal":
                    mean, std = self.p_likelihood[col][class_value]
                    #2 Only compute the pdf with non-zero std
                    if std == 0:
                        continue
                    p = norm.pdf(values, mean, std)
                elif self.distribution == "Bernoulli":
                    p = bernoulli.pmf(values, self.p_likelihood[col][class_value])
                #1.b Set p=1 for 0 feature values in testing examples to calculate the posterior. 
                if col in self.missing:  
                    p = np.where(values != 0,p,1)
                #2 Avoid log 0 p values
                p = np.where(p != 0,p,1)
                log_p_posterior[class_value] += np.log(p)
        i = 0
        self.res = {}
        for index, example in self.test_df.iterrows():
            self.res[index] = max(log_p_posterior, key=lambda x:log_p_posterior[x][i])
            i += 1
        return self
        '''
        for index, example in self.test_df.iterrows():
            log_p_predict = {class_value:np.log(self.p_prior[class_value]) for class_value in self.p_prior}
            for col in list(self.test_df)[:-1]:
                for class_value in self.p_prior:
                    #1.b Ignore 0 values in testing examples to calculate the posterior.
                    if col in self.missing:
                        if example[col] == 0:
                            continue
                    #2 only compute the pdf of existing distribution
                    if self.p_likelihood[col][class_value] != None:
                        p = self.p_likelihood[col][class_value].pdf(example[col])
                        if p != 0:                                           
                            log_p_predict[class_value] += np.log(p)
            self.res[index] = max(log_p_predict, key=log_p_predict.get)
        return self
        '''
    
    def get_acc(self):
        cnt = 0 
        for index, example in self.test_df.iterrows():
            cnt += 1 if example["Class"] == self.res[index] else 0
        return cnt/len(self.test_df)
                        
    # split the data, 20% for testing, rest for training.
    def split(self, df):
        msk = np.random.rand(len(df)) < 0.8
        train = df[msk]
        test = df[~msk]
        return train, test


In [402]:
#p1.a
#calculate the average over 10 splits
print("1.a")
num_splits = 10
print(sum([NBC(df = pid_df).train().predict().get_acc() for _ in range(num_splits)])/num_splits)

#p1.b
print("1.b")
num_splits = 10
print(sum([NBC(df = pid_df, missing = ["BloodPressure", "SkinThickness","BMI", "Age"]).train().predict().get_acc() for _ in range(num_splits)])/num_splits)

1.a
0.7539772027863384
1.b
0.7431450604175607


In [406]:
#p2
#load the data
from mnist import MNIST
mndata = MNIST("./python-mnist/data")
train_images, train_labels = mndata.load_training()
train_images = np.array(train_images)
train_labels = np.array(train_labels).reshape(len(train_labels), 1)
test_images, test_labels = mndata.load_testing()
test_images = np.array(test_images)
test_labels = np.array(test_labels).reshape(len(test_labels), 1)

#thresholding
threshold = 127
train_images = np.where(threshold >= train_images,0,1)
test_images = np.where(threshold >= test_images,0,1)

#stretch and resize
def stretch(i):
    i = i.reshape(28,28)
    r_sum = np.sum(i, axis=1)
    c_sum = np.sum(i, axis=0)
    r_max,r_min = np.argwhere(r_sum>0)[0][0], np.argwhere(r_sum>0)[-1][0]
    c_max,c_min = np.argwhere(c_sum>0)[0][0], np.argwhere(c_sum>0)[-1][0]
    r_center,c_center = (r_max+r_min)//2, (c_max+c_min)//2
    if r_center < 10:
        r_center = 10
    if c_center < 10:
        c_center = 10
    if r_center > 18:
        r_center = 18
    if c_center > 18:
        c_center = 18
    i = i[r_center-10:r_center+10,c_center-10:c_center+10]
    return i.reshape(20*20,)
train_stretched_images = []
test_stretched_images = []
for i in train_images:
    train_stretched_images.append(stretch(i))
for i in test_images:
    test_stretched_images.append(stretch(i))
train_stretched_images = np.array(train_stretched_images)
test_stretched_images = np.array(test_stretched_images)

#create dataframe using images
train_df = pd.DataFrame(np.concatenate((train_images, train_labels), axis=1), columns=list(range(len(train_images[0]))) + ["Class"])
test_df = pd.DataFrame(np.concatenate((test_images, test_labels), axis=1), columns=list(range(len(test_images[0]))) + ["Class"])
train_stretched_df = pd.DataFrame(np.concatenate((train_stretched_images, train_labels), axis=1), columns=list(range(len(train_stretched_images[0]))) + ["Class"])
test_stretched_df = pd.DataFrame(np.concatenate((test_stretched_images, test_labels), axis=1), columns=list(range(len(test_stretched_images[0]))) + ["Class"])

(60000, 400)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 

In [407]:
#p2.a
#Use the NBC from problem 1
print("2.a")
print("Normal:")
print(" Untouched:")
print("  test_acc", NBC(train_df = train_df, test_df = test_df, distribution="Normal").train().predict().get_acc())
print("  train_acc", NBC(train_df = train_df, test_df = train_df, distribution="Normal").train().predict().get_acc())
print(" Stretched:")
print("  test_acc", NBC(train_df = train_stretched_df, test_df = test_stretched_df, distribution="Normal").train().predict().get_acc())
print("  train_acc", NBC(train_df = train_stretched_df, test_df = train_stretched_df, distribution="Normal").train().predict().get_acc())

print("Bernoulli:")
print(" Untouched:")
print("  test_acc", NBC(train_df = train_df, test_df = test_df, distribution="Bernoulli").train().predict().get_acc())
print("  train_acc", NBC(train_df = train_df, test_df = train_df, distribution="Bernoulli").train().predict().get_acc())
print(" Stretched:")
print("  test_acc", NBC(train_df = train_stretched_df, test_df = test_stretched_df, distribution="Bernoulli").train().predict().get_acc())
print("  train_acc", NBC(train_df = train_stretched_df, test_df = train_stretched_df, distribution="Bernoulli").train().predict().get_acc())


2.a
Normal:
 Untouched:
  test_acc 0.7779
  train_acc 0.7714
 stretched:
  test_acc 0.7279
  train_acc 0.7203333333333334
Bernoulli:
 Untouched:
  test_acc 0.8384
  train_acc 0.8319166666666666
 stretched:
  test_acc 0.8101
  train_acc 0.7998166666666666


In [334]:
#p2.b

array([-2.30258509])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,

array([[1, 2],
       [2, 2]])