In [19]:
import pandas as pd
import numpy as np
from scipy.stats import norm

## Read data

In [8]:
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']
features = column_names[:8]
df = pd.read_csv('data/pima-indians-diabetes.csv', names=column_names)

In [9]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Util functions to split data

In [10]:
def splitData(df, train_test_ratio):
    train_set = df.sample(frac=train_test_ratio)
    test_set = df.sample(frac=(1 - train_test_ratio))
    return train_set, test_set

In [11]:
train_set, test_set = splitData(df, 0.8)
assert test_set.shape[0] + train_set.shape[0] == df.shape[0]

In [12]:
print(train_set.shape)
train_set.head()

(614, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
29,5,117,92,0,0,34.1,0.337,38,0
329,6,105,70,32,68,30.8,0.122,37,0
195,5,158,84,41,210,39.4,0.395,29,1
604,4,183,0,0,0,28.4,0.212,36,1
310,6,80,66,30,0,26.2,0.313,41,0


In [13]:
print(test_set.shape)
test_set.head()

(154, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
48,7,103,66,32,0,39.1,0.344,31,1
409,1,172,68,49,579,42.4,0.702,28,1
306,10,161,68,23,132,25.5,0.326,47,1
537,0,57,60,0,0,21.7,0.735,67,0
714,3,102,74,0,0,29.5,0.121,32,0


## Get labels from data set

In [14]:
def get_class_probobilities_and_names(train_set):
    value_counts = train_set.Class.value_counts(normalize=True)
    p_labels = value_counts
    labels = value_counts.index
    return p_labels, labels

## Calculate params

In [15]:
def calculate_mean(data, ignore_missing_value):
    if ignore_missing_value:
        data[data == 0] = np.nan
        mean = np.nanmean(data)
        return mean
    return np.mean(data)

# for each class, each feature, calculate mean and variance
def get_class_feature_summary(train_set, ignore_missing_value):
    summary_df = pd.DataFrame(columns=['Class', 'Feature', 'Mean', 'Var'])
    i = 0
    p_classes, classes = get_class_probobilities_and_names(train_set)
    for label in classes:
        each_class_df = train_set[train_set['Class']==label]
        each_class_df = each_class_df.drop(labels='Class', axis=1) 
        for column in each_class_df:
            feature_data = each_class_df[column]
            each_feature_mean = calculate_mean(feature_data, ((column in ['BloodPressure', 'SkinThickness', 'BMI', 'Age']) and ignore_missing_value))
            each_feature_var = np.var(feature_data)
            summary_df.loc[i] = [label, column, each_feature_mean, each_feature_var]
            i = i + 1
    return summary_df
    

## Pridict

In [16]:
def get_mean_var(df, klass, feature):
    row = df[(df['Class']==klass) & (df['Feature']==feature)]
    return row['Mean'], row['Var']

# for each class, get the log p(class|feature_vec) value and return the max
def predict(class_feature_summary, feature_vec):
    p_classes, classes = get_class_probobilities_and_names(train_set)
    probabilities = {}
    for klass in classes:
        log_sum = 0
        for i, feature in enumerate(feature_vec, start=0):
            mean, var = get_mean_var(class_feature_summary, klass, features[i])
            log_sum = log_sum + np.log(norm.pdf(feature, mean , np.sqrt(var)))[0]
        log_sum = log_sum + np.log(p_classes[klass])
        probabilities[klass] = log_sum
    if (probabilities[0] > probabilities[1]):
        return 0
    return 1

## Evaluate

In [17]:
def calculate_accuracy(actual, predicts):
    TP = 0
    num_total = len(actual)
    for i in range(num_total):
        if actual[i] == predicts[i]:
            TP = TP + 1
    return TP/num_total

In [18]:
def get_accuracy_for_one_iteration(ignore_missing_value):
    test_set, test_set = splitData(df, 0.8)
    summary = get_class_feature_summary(train_set, ignore_missing_value)
    predicts = test_set.apply(lambda x: predict(summary, x[:8]), axis=1)
    accuracy = calculate_accuracy(test_set.Class.tolist(), predicts.tolist())
    return accuracy

def get_avg_accuracy(iteration, ignore_missing_value):
    avg_accuracy = 0
    for i in range(iteration):
        print(f'Itr {i + 1}')
        accuracy = get_accuracy_for_one_iteration(ignore_missing_value)
        print(f"accuracy: {accuracy}")
        avg_accuracy = (avg_accuracy * i + accuracy)/(i+1)
        print(f"avg_accuracy: {avg_accuracy}")
        print("\n")
    return avg_accuracy


## Run 10 times and calculate average accuracy (with missing values)

In [21]:
avg_accuracy1a = get_avg_accuracy(10, ignore_missing_value=False)

Itr 1
accuracy: 0.7922077922077922
avg_accuracy: 0.7922077922077922


Itr 2
accuracy: 0.7597402597402597
avg_accuracy: 0.775974025974026


Itr 3
accuracy: 0.7727272727272727
avg_accuracy: 0.774891774891775


Itr 4
accuracy: 0.7467532467532467
avg_accuracy: 0.7678571428571429


Itr 5
accuracy: 0.7857142857142857
avg_accuracy: 0.7714285714285715


Itr 6
accuracy: 0.7532467532467533
avg_accuracy: 0.7683982683982684


Itr 7
accuracy: 0.7857142857142857
avg_accuracy: 0.7708719851576994


Itr 8
accuracy: 0.7532467532467533
avg_accuracy: 0.7686688311688312


Itr 9
accuracy: 0.7402597402597403
avg_accuracy: 0.7655122655122656


Itr 10
accuracy: 0.7662337662337663
avg_accuracy: 0.7655844155844156




## Run 10 times and calculate average accuracy (without missing values)

In [20]:
avg_accuracy1b = get_avg_accuracy(10, ignore_missing_value=True)

Itr 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


accuracy: 0.7467532467532467
avg_accuracy: 0.7467532467532467


Itr 2
accuracy: 0.7337662337662337
avg_accuracy: 0.7402597402597402


Itr 3
accuracy: 0.7597402597402597
avg_accuracy: 0.7467532467532467


Itr 4
accuracy: 0.7467532467532467
avg_accuracy: 0.7467532467532467


Itr 5
accuracy: 0.7207792207792207
avg_accuracy: 0.7415584415584415


Itr 6
accuracy: 0.7857142857142857
avg_accuracy: 0.7489177489177489


Itr 7
accuracy: 0.7012987012987013
avg_accuracy: 0.7421150278293135


Itr 8
accuracy: 0.7792207792207793
avg_accuracy: 0.7467532467532467


Itr 9
accuracy: 0.7467532467532467
avg_accuracy: 0.7467532467532467


Itr 10
accuracy: 0.7792207792207793
avg_accuracy: 0.75


