# Problem Statement:


## Diabetes Classification

A famous collection of data on whether a patient has diabetes, known as the Pima Indians dataset and originally owned by the National Institute of Diabetes and Digestive and Kidney Diseases, can be found on Kaggle. Download this dataset from https://www.kaggle.com/kumargh/pimaindiansdiabetescsv. The data contains a set of attributes for each patient and a categorical variable telling whether the patient is diabetic or not. For several attributes in this data set, a value of 0 may indicate a missing value of the variable. There are a total of 767 data-points.

### Question: Part-A

Build a simple naive Bayes classifier to classify this data set. Use a normal distribution to model each of the class-conditional distributions.

Compute an estimate of the accuracy of the classifier by averaging over 10 test-train splits. Each split should randomly assign 20% of the data to test, and the rest to train.
Write this classifier and the test-train split code from scratch.  Libraries can only be used to load & hold the data.

### Question: Part-B

Now adjust your code so that, for attribute 3 (Diastolic blood pressure), attribute 4 (Triceps skinfold thickness), attribute 6 (Body mass index), and attribute 8 (Age), it regards a value of 0 as a missing value when estimating the class-conditional distributions, and the posterior. 

Compute an estimate of the accuracy of the classifier by averaging over 10 test-train splits.

### Code: Part A

In [4]:
import pandas as pd
import numpy as np
import random
import csv

#Load Dataset 
def loadfile(file):
    """Load an all-numeric CSV file into a list of float rows.

    Args:
        file: Path of the CSV file to read. Every field of every row must be
            parseable by ``float``.

    Returns:
        A list holding one list of floats per non-empty CSV row.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle even when parsing fails;
    # newline='' is what the csv module asks for when given a file object.
    with open(file, newline='') as handle:
        # csv.reader yields [] for blank lines; skip them so a trailing empty
        # line does not turn into a row that has no class label.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

#Train Test Split
def train_test_split(datafile, split_ratio):
    """Randomly partition the rows into a training part and a test part.

    Args:
        datafile: Sequence of rows; it is copied, not modified.
        split_ratio: Fraction of the rows that goes to the training part.

    Returns:
        A two-element list ``[train_rows, test_rows]``.
    """
    n_train = int(len(datafile) * split_ratio)
    remaining = list(datafile)
    # Draw without replacement: each pick removes one random row from the pool,
    # and whatever is left over at the end becomes the test part.
    picked = [
        remaining.pop(random.randrange(len(remaining)))
        for _ in range(n_train)
    ]
    return [picked, remaining]

#Create a dictionary separating class and data
def class_data_dict(datafile):
    """Group rows by their class label (the last element of each row).

    Args:
        datafile: Sequence of rows whose final element is the class label.

    Returns:
        A dict mapping each label to the list of complete rows carrying it.
    """
    grouped = {}
    for record in datafile:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(record[-1], []).append(record)
    return grouped

#Calculate Mean and Standard Deviation
import math
def mean(values):
    """Return the arithmetic mean of ``values`` as a float."""
    total = sum(values)
    count = float(len(values))
    return float(total / count)
 
def stdev(values):
    """Return the sample standard deviation (n - 1 denominator) of ``values``."""
    n = len(values)
    avg = float(sum(values) / float(n))
    squared_deviations = [(x - avg) ** 2 for x in values]
    # Divide by n - 1 rather than n, i.e. the sample variance.
    variance = float(float(sum(squared_deviations)) / float(n - 1))
    return math.sqrt(variance)

#Per-attribute (mean, standard deviation) pairs; the class column is dropped
def condense(datafile):
    """Summarise every attribute column as a ``(mean, stdev)`` pair.

    Args:
        datafile: Rows of equal length whose last element is the class label.

    Returns:
        A list with one ``(mean, stdev)`` tuple per column, excluding the last
        column.
    """
    summaries = []
    for column in zip(*datafile):
        summaries.append((mean(column), stdev(column)))
    # The final column is the class label, not an attribute: discard it.
    summaries.pop()
    return summaries

#Separate class and attribute properties like mean, std 
def condense_class_dict(datafile):
    """Compute the per-attribute ``(mean, stdev)`` summaries of each class.

    Args:
        datafile: Rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the result of ``condense`` applied
        to the rows of that class.
    """
    return {
        label: condense(rows)
        for label, rows in class_data_dict(datafile).items()
    }

#calculateProbability
from scipy.stats import norm
import math

def calc_log_probability(x, mean, stdev):
    """Return the log of the normal density N(mean, stdev) evaluated at ``x``.

    Args:
        x: Value (or array of values) at which to evaluate the density.
        mean: Mean of the normal distribution.
        stdev: Standard deviation of the normal distribution.

    Returns:
        The log-density as a NumPy float (or array, when ``x`` is an array).
    """
    # Use logpdf rather than log(pdf): far from the mean the density itself
    # underflows to 0.0, which would give -inf (plus a RuntimeWarning) and make
    # the competing class scores impossible to compare.
    return norm.logpdf(x, mean, stdev)

def calc_only_class_log_prob(train_data):
    """Return the log prior probability of every class in the training rows.

    Args:
        train_data: Rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the log of its relative frequency
        in ``train_data``.
    """
    # Tally how many rows carry each label, in first-seen order.
    tallies = {}
    for record in train_data:
        label = record[-1]
        tallies[label] = tallies.get(label, 0) + 1
    n_rows = sum(tallies.values())
    return {label: np.log(tally / n_rows) for label, tally in tallies.items()}

#calculateClassProbabilities ***log of probability***
def calc_class_probabilities(condensed_values, row, train_data):
    """Score one row against every class using summed log-probabilities.

    Args:
        condensed_values: Dict mapping class label to a list of
            ``(mean, stdev)`` pairs, one pair per attribute.
        row: The row to score; ``row[i]`` is matched with the i-th pair.
        train_data: Training rows, used only to derive the class log-priors.

    Returns:
        A dict mapping each class label to the sum of the per-attribute
        log-densities plus the log-prior of that class.
    """
    log_priors = calc_only_class_log_prob(train_data)
    scores = {}
    for label, summaries in condensed_values.items():
        prior = log_priors[label]
        running = 0
        for position, (mu, sigma) in enumerate(summaries):
            running += calc_log_probability(row[position], mu, sigma)
        # The prior is added last, after all attribute terms.
        running += prior
        scores[label] = running
    return scores

#predict
def class_prediction(condensed_values, row, train_data):
    """Return the class label with the highest log-probability score for ``row``.

    Args:
        condensed_values: Per-class attribute summaries, passed through to
            ``calc_class_probabilities``.
        row: The row to classify.
        train_data: Training rows, passed through to
            ``calc_class_probabilities``.

    Returns:
        The best-scoring class label (the first one on ties), or ``None`` when
        there are no classes to choose from.
    """
    scores = calc_class_probabilities(condensed_values, row, train_data)
    # max() keeps the first label among equal scores, matching a strict
    # "greater than" scan over the dict in insertion order.
    return max(scores, key=scores.get, default=None)

#Predict the class of every row in the test data
def predict_test_data(condensed_values, train_data, test_data):
    """Predict a class label for every row of ``test_data``.

    Args:
        condensed_values: Per-class attribute summaries.
        train_data: Training rows, forwarded to ``class_prediction``.
        test_data: Rows to classify.

    Returns:
        A list of predicted labels, in the same order as ``test_data``.
    """
    return [
        class_prediction(condensed_values, record, train_data)
        for record in test_data
    ]

#getAccuracy
def prediction_accuracy(test_data, test_predictions):
    """Return the fraction of test rows whose label equals the prediction.

    Args:
        test_data: Rows whose last element is the true class label.
        test_predictions: Predicted labels, indexed like ``test_data``.

    Returns:
        The number of matching labels divided by ``len(test_data)``, as a float.
    """
    hits = sum(
        1
        for position, record in enumerate(test_data)
        if record[-1] == test_predictions[position]
    )
    return hits / float(len(test_data))

def start(file='pima-indians-diabetes.csv', split_ratio=0.80, n_splits=10):
    """Train and evaluate the classifier over several random train/test splits.

    For each split the rows are re-partitioned at random, the per-class
    attribute summaries are fitted on the training part, and the accuracy on
    the test part is printed. The mean accuracy over all splits is printed at
    the end.

    Args:
        file: Path of the CSV file holding the data set.
        split_ratio: Fraction of the rows used for training in each split.
        n_splits: Number of independent random splits to average over.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1')
    datafile = loadfile(file)
    accuracy_list = []
    for i in range(n_splits):
        # train_test_split works on its own copy, so the loaded rows can be
        # reused unchanged for every split.
        train_data, test_data = train_test_split(datafile, split_ratio)
        # prepare model
        condense_class = condense_class_dict(train_data)
        # test model
        test_predictions = predict_test_data(condense_class, train_data, test_data)
        accuracy = prediction_accuracy(test_data, test_predictions)
        accuracy_list.append(accuracy)
        print(f'Fold {i+1} accuracy: {accuracy} ')

    mean_accuracy = float(sum(accuracy_list)/len(accuracy_list))
    # Report the real number of splits instead of a hard-coded "10".
    print(f'Mean Accuracy over {n_splits} train-test splits: {mean_accuracy}')


start()

Fold 1 accuracy: 0.7857142857142857 
Fold 2 accuracy: 0.7077922077922078 
Fold 3 accuracy: 0.7532467532467533 
Fold 4 accuracy: 0.7662337662337663 
Fold 5 accuracy: 0.7857142857142857 
Fold 6 accuracy: 0.7402597402597403 
Fold 7 accuracy: 0.7597402597402597 
Fold 8 accuracy: 0.7792207792207793 
Fold 9 accuracy: 0.7597402597402597 
Fold 10 accuracy: 0.7467532467532467 
Mean Accuracy over 10 train-test splits: 0.7584415584415585
