In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm

## Read data

In [3]:
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']
features = column_names[:8]
df = pd.read_csv('pima-indians-diabetes.csv', names=column_names)

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Util functions to split data

In [18]:
def splitData(df, train_test_ratio):
    train_set = df.sample(frac=train_test_ratio)
    test_set = df.sample(frac=(1 - train_test_ratio))
    return train_set, test_set

In [19]:
train_set, test_set = splitData(df, 0.8)
assert test_set.shape[0] + train_set.shape[0] == df.shape[0]

In [20]:
print(train_set.shape)
train_set.head()

(614, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
329,6,105,70,32,68,30.8,0.122,37,0
164,0,131,88,0,0,31.6,0.743,32,1
554,1,84,64,23,115,36.9,0.471,28,0
228,4,197,70,39,744,36.7,2.329,31,0
240,1,91,64,24,0,29.2,0.192,21,0


In [21]:
print(test_set.shape)
test_set.head()

(154, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
487,0,173,78,32,265,46.5,1.159,58,0
532,1,86,66,52,65,41.3,0.917,29,0
441,2,83,66,23,50,32.2,0.497,22,0
678,3,121,52,0,0,36.0,0.127,25,1
141,5,106,82,30,0,39.5,0.286,38,0


## Get labels from data set

In [22]:
def get_class_probobilities_and_names(train_set):
    value_counts = train_set.Class.value_counts(normalize=True)
    p_labels = value_counts
    labels = value_counts.index
    return p_labels, labels

## Calculate

In [23]:
# for each class, each feature, calculate mean and variance
def get_class_feature_summary(train_set):
    summary_df = pd.DataFrame(columns=['Class', 'Feature', 'Mean', 'Var'])
    i = 0
    for label in labels:
        each_class_df = train_set[train_set['Class']==label]
        each_class_df = each_class_df.drop(labels='Class', axis=1) 
        for column in each_class_df:
            feature_data = each_class_df[column]
            each_feature_mean = np.mean(feature_data)
            each_feature_var = np.var(feature_data)
            summary_df.loc[i] = [label, column, each_feature_mean, each_feature_var]
            i = i + 1
    return summary_df
    

## Pridict

In [24]:
def get_mean_var(df, klass, feature):
    row = df[(df['Class']==klass) & (df['Feature']==feature)]
    return row['Mean'], row['Var']

# for each class, get the log p(class|feature_vec) value and return the max
def predict(class_feature_summary, feature_vec):
    p_classes, classes = get_class_probobilities_and_names(train_set)
    probabilities = {}
    for klass in classes:
        log_sum = 0
        for i, feature in enumerate(feature_vec, start=0):
            mean, var = get_mean_var(class_feature_summary, klass, features[i])
            log_sum = log_sum + np.log(norm.pdf(feature, mean , np.sqrt(var)))[0]
        log_sum = log_sum + np.log(p_labels[klass])
        probabilities[klass] = log_sum
    if (probabilities[0] > probabilities[1]):
        return 0
    return 1

## Evaluate

In [25]:
def calculateAccuracy(actual, predicts):
    TP = 0
    num_total = len(actual)
    for i in range(num_total):
        if actual[i] == predicts[i]:
            TP = TP + 1
    return TP/num_total

## Run 10 times and calculate average accuracy

In [30]:
avg_accuracy = 0
for i in range(10):
    print(f'Itr {i + 1}')
    test_set, test_set = splitData(df, 0.8)
    summary = get_class_feature_summary(train_set)
    predicts = test_set.apply(lambda x:  predict(summary, x[:8]), axis=1)
    accuracy = calculateAccuracy(test_set.Class.tolist(), predicts.tolist())
    print(f"accuracy: {accuracy}")
    avg_accuracy = (avg_accuracy * i + accuracy)/(i+1)
    print(f"avg_accuracy: {avg_accuracy}")
    print("\n")
    

Itr 1
accuracy: 0.7012987012987013
avg_accuracy: 0.7012987012987013


Itr 2
accuracy: 0.8376623376623377
avg_accuracy: 0.7694805194805194


Itr 3
accuracy: 0.7207792207792207
avg_accuracy: 0.7532467532467532


Itr 4
accuracy: 0.7922077922077922
avg_accuracy: 0.7629870129870129


Itr 5
accuracy: 0.7727272727272727
avg_accuracy: 0.7649350649350648


Itr 6
accuracy: 0.8246753246753247
avg_accuracy: 0.7748917748917749


Itr 7
accuracy: 0.7532467532467533
avg_accuracy: 0.7717996289424861


Itr 8
accuracy: 0.7012987012987013
avg_accuracy: 0.762987012987013


Itr 9
accuracy: 0.7532467532467533
avg_accuracy: 0.761904761904762


Itr 10
accuracy: 0.7597402597402597
avg_accuracy: 0.7616883116883117


