In [367]:
import pandas as pd
import numpy as np
from scipy.stats import norm

## Read data

In [368]:
# Column names are supplied explicitly via `names=`, so every line of the CSV
# is read as a data row. The last column ('Class') is the label; every column
# before it is a feature.
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Class',
]
features = column_names[:-1]
df = pd.read_csv('pima-indians-diabetes.csv', names=column_names)

In [369]:
# Sanity-check the load by displaying the first few rows.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Util functions to split data

In [370]:
def splitData(df, train_test_ratio, random_state=None):
    """Randomly partition ``df`` into disjoint train and test sets.

    The frame is shuffled once and then cut in two, so every row lands in
    exactly one of the returned sets. (Sampling the two sets independently
    would let the same row appear in both and leave other rows unused.)

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Rows are selected by position, so duplicate index
        labels are handled correctly.
    train_test_ratio : float
        Fraction of rows, between 0 and 1 inclusive, assigned to the
        training set. The row count is rounded to the nearest integer.
    random_state : int, numpy Generator/RandomState or None, optional
        Forwarded to ``DataFrame.sample`` to make the shuffle reproducible.
        ``None`` (the default) gives a different split on every call.

    Returns
    -------
    (train_set, test_set) : tuple of pandas.DataFrame
        Two shuffled, non-overlapping frames whose row counts sum to
        ``len(df)``.

    Raises
    ------
    ValueError
        If ``train_test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= train_test_ratio <= 1:
        raise ValueError(
            "train_test_ratio must be between 0 and 1, got %r" % (train_test_ratio,)
        )
    # Shuffle all rows once; slicing the shuffled frame guarantees disjointness.
    shuffled = df.sample(frac=1, random_state=random_state)
    n_train = int(round(train_test_ratio * len(shuffled)))
    train_set = shuffled.iloc[:n_train]
    test_set = shuffled.iloc[n_train:]
    return train_set, test_set

In [380]:
# Use 80% of the rows for training and the rest for testing.
train_set, test_set = splitData(df, 0.8)
# Checks row counts only; it does not verify that the two sets are disjoint.
assert len(test_set) + len(train_set) == len(df)

In [381]:
# Show the training set's (rows, columns) and a sample of its rows.
print(train_set.shape)
train_set.head()

(614, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
283,7,161,86,0,0,30.4,0.165,47,1
145,0,102,75,23,0,0.0,0.572,21,0
531,0,107,76,0,0,45.3,0.686,24,0
148,5,147,78,0,0,33.7,0.218,65,0
757,0,123,72,0,0,36.3,0.258,52,1


In [382]:
# Show the test set's (rows, columns) and a sample of its rows.
print(test_set.shape)
test_set.head()

(154, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
545,8,186,90,35,225,34.5,0.423,37,1
91,4,123,80,15,176,32.0,0.443,34,0
709,2,93,64,32,160,38.0,0.674,23,1
477,7,114,76,17,110,23.8,0.466,31,0
701,6,125,78,31,0,27.6,0.565,49,1


In [383]:
# Empirical class priors P(class): the fraction of training rows per label.
p_labels = train_set['Class'].value_counts(normalize=True)
value_counts = p_labels
# The distinct class labels, ordered from most to least frequent.
labels = p_labels.index

## Calculate per-class feature mean and variance

In [384]:
# Fit the Gaussian parameters: for each class and each feature, compute the
# mean and variance of that feature over the training rows of that class.
#
# The rows are collected in a list and the DataFrame is built once at the end.
# Appending to a DataFrame row by row with `.loc[i] = ...` re-allocates on each
# insert and starts from object-dtype columns.
parameter_rows = []
for label in labels:
    each_class_df = train_set[train_set['Class'] == label]
    each_class_df = each_class_df.drop(labels='Class', axis=1)
    for column in each_class_df:
        feature_data = each_class_df[column]
        parameter_rows.append({
            'Class': label,
            'Feature': column,
            'Mean': np.mean(feature_data),
            # np.var defaults to ddof=0, i.e. the population variance.
            'Var': np.var(feature_data),
        })

# One row per (class, feature) pair, indexed 0..n-1 in insertion order.
class_feature_classifier = pd.DataFrame(
    parameter_rows, columns=['Class', 'Feature', 'Mean', 'Var']
)

class_feature_classifier
    

Unnamed: 0,Class,Feature,Mean,Var
0,0,Pregnancies,3.254499,9.099755
1,0,Glucose,108.922879,651.839811
2,0,BloodPressure,67.8509,320.934067
3,0,SkinThickness,19.372751,222.341777
4,0,Insulin,68.167095,8568.617323
5,0,BMI,29.883033,62.346756
6,0,DiabetesPedigreeFunction,0.422072,0.086701
7,0,Age,30.832905,127.93866
8,1,Pregnancies,4.804444,13.428425
9,1,Glucose,142.102222,1082.616217


## Predict

In [385]:
def get_mean_var(df, klass, feature):
    """Select the fitted mean and variance for one (class, feature) pair.

    ``df`` is expected to have 'Class', 'Feature', 'Mean' and 'Var' columns.
    The result is a pair of pandas Series (not scalars) holding the 'Mean'
    and 'Var' values of every row that matches both ``klass`` and
    ``feature``; the Series are empty when nothing matches.
    """
    is_match = (df['Class'] == klass) & (df['Feature'] == feature)
    matched = df.loc[is_match]
    return matched['Mean'], matched['Var']

def predict(classifier, classes, feature_vec, feature_names=None, priors=None):
    """Return the class with the highest Gaussian Naive Bayes log-posterior.

    For every class in ``classes`` the score is

        log P(class) + sum_i log N(x_i; mean[class, i], var[class, i])

    and the class with the largest score is returned.

    Parameters
    ----------
    classifier : pandas.DataFrame
        Fitted parameters with 'Class', 'Feature', 'Mean' and 'Var' columns,
        one row per (class, feature) pair.
    classes : iterable
        Candidate class labels. All of them are scored, so this works for
        more than two classes and for labels other than 0/1.
    feature_vec : iterable of float
        Feature values, in the same order as ``feature_names``.
    feature_names : sequence of str, optional
        Names of the features in ``feature_vec``. Defaults to the
        module-level ``features`` list.
    priors : mapping or pandas.Series, optional
        Prior probability per class label. Defaults to the module-level
        ``p_labels``.

    Returns
    -------
    The winning class label, or ``None`` if ``classes`` is empty. On an exact
    tie the class that appears later in ``classes`` wins, which keeps the
    "ties go to 1" behaviour for ``classes == [0, 1]``.

    Raises
    ------
    KeyError
        If ``classifier`` has no row for a requested (class, feature) pair.
    """
    if feature_names is None:
        feature_names = features
    if priors is None:
        priors = p_labels

    # Build the (class, feature) -> (mean, var) lookup once instead of
    # boolean-filtering the whole frame for every class/feature combination.
    params = {
        (row_class, row_feature): (float(row_mean), float(row_var))
        for row_class, row_feature, row_mean, row_var in zip(
            classifier['Class'], classifier['Feature'],
            classifier['Mean'], classifier['Var'],
        )
    }

    best_class = None
    best_score = None
    for klass in classes:
        log_posterior = float(np.log(priors[klass]))
        for name, value in zip(feature_names, feature_vec):
            mean, var = params[(klass, name)]
            # logpdf works in log space directly. log(pdf(x)) underflows to
            # -inf for values far from the mean, which makes every class tie.
            log_posterior += float(norm.logpdf(value, loc=mean, scale=np.sqrt(var)))
        if best_score is None or log_posterior >= best_score:
            best_class, best_score = klass, log_posterior
    return best_class

## Test

In [386]:
# Classify every test row; the first 8 entries of each row are passed as the
# feature vector.
predicts = test_set.apply(
    lambda row: predict(class_feature_classifier, labels, row[:8]),
    axis=1,
)

## Evaluate

In [387]:
def calculateAccuracy(actual, predicts):
    """Return the fraction of predictions that equal the actual labels.

    Values are compared position by position, so lists, tuples, NumPy arrays
    and pandas Series (whatever their index labels) are all handled the same
    way.

    Parameters
    ----------
    actual : sequence
        Ground-truth labels.
    predicts : sequence
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    float
        ``correct / total``, a value in ``[0, 1]``. Returns ``0.0`` when both
        inputs are empty rather than dividing by zero.

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    # Materialise as lists so the comparison is positional, never label-based.
    actual_values = list(actual)
    predicted_values = list(predicts)
    num_total = len(actual_values)
    if num_total != len(predicted_values):
        raise ValueError(
            "actual and predicts must have the same length, got %d and %d"
            % (num_total, len(predicted_values))
        )
    if num_total == 0:
        return 0.0
    # Counts every correct prediction (true positives and true negatives).
    num_correct = sum(1 for a, p in zip(actual_values, predicted_values) if a == p)
    return num_correct / num_total

In [388]:
# Convert both Series to plain lists so calculateAccuracy indexes them by
# position rather than by the rows' index labels.
calculateAccuracy(test_set.Class.tolist(), predicts.tolist())

0.7792207792207793