In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

## Read data

In [2]:
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']
features = column_names[:8]
df = pd.read_csv('data/pima-indians-diabetes.csv', names=column_names)

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Util functions to split data

In [4]:
def splitData(df, train_test_ratio):
    train_set = df.sample(frac=train_test_ratio)
    test_set = df.sample(frac=(1 - train_test_ratio))
    return train_set, test_set

In [5]:
train_set, test_set = splitData(df, 0.8)
assert test_set.shape[0] + train_set.shape[0] == df.shape[0]

In [6]:
print(train_set.shape)
train_set.head()

(614, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
76,7,62,78,0,0,32.6,0.391,41,0
455,14,175,62,30,0,33.6,0.212,38,1
0,6,148,72,35,0,33.6,0.627,50,1
347,3,116,0,0,0,23.5,0.187,23,0
673,3,123,100,35,240,57.3,0.88,22,0


In [7]:
print(test_set.shape)
test_set.head()

(154, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
62,5,44,62,0,0,25.0,0.587,36,0
201,1,138,82,0,0,40.1,0.236,28,0
682,0,95,64,39,105,44.6,0.366,22,0
34,10,122,78,31,0,27.6,0.512,45,0
675,6,195,70,0,0,30.9,0.328,31,1


## Get labels from data set

In [8]:
def get_class_probobilities_and_names(train_set):
    value_counts = train_set.Class.value_counts(normalize=True)
    p_labels = value_counts
    labels = value_counts.index
    return p_labels, labels

## Calculate params

In [9]:
def calculate_mean(data, ignore_missing_value):
    if ignore_missing_value:
        data[data == 0] = np.nan
        mean = np.nanmean(data)
        return mean
    return np.mean(data)

# for each class, each feature, calculate mean and variance
def get_class_feature_summary(train_set, ignore_missing_value):
    summary_df = pd.DataFrame(columns=['Class', 'Feature', 'Mean', 'Var'])
    i = 0
    p_classes, classes = get_class_probobilities_and_names(train_set)
    for label in classes:
        each_class_df = train_set[train_set['Class']==label]
        each_class_df = each_class_df.drop(labels='Class', axis=1) 
        for column in each_class_df:
            feature_data = each_class_df[column]
            each_feature_mean = calculate_mean(feature_data, ((column in ['BloodPressure', 'SkinThickness', 'BMI', 'Age']) and ignore_missing_value))
            each_feature_var = np.var(feature_data)
            summary_df.loc[i] = [label, column, each_feature_mean, each_feature_var]
            i = i + 1
    return summary_df
    

## Pridict

In [17]:
def get_mean_var(df, klass, feature):
    row = df[(df['Class']==klass) & (df['Feature']==feature)]
    return row['Mean'], row['Var']

# for each class, get the log p(class|feature_vec) value and return the max
def predict(class_feature_summary, feature_vec):
    p_classes, classes = get_class_probobilities_and_names(train_set)
    probabilities = {}
    for klass in classes:
        log_sum = 0
        for i, feature in enumerate(feature_vec, start=0):
            mean, var = get_mean_var(class_feature_summary, klass, features[i])
            log_sum = log_sum + np.log(norm.pdf(feature, mean , np.sqrt(var)))[0]
        log_sum = log_sum + np.log(p_classes[klass])
        probabilities[klass] = log_sum
    if (probabilities[0] > probabilities[1]):
        return 0
    return 1

## Evaluate

In [18]:
def calculate_accuracy(actual, predicts):
    TP = 0
    num_total = len(actual)
    for i in range(num_total):
        if actual[i] == predicts[i]:
            TP = TP + 1
    return TP/num_total

In [19]:
def get_accuracy_for_one_iteration(ignore_missing_value):
    test_set, test_set = splitData(df, 0.8)
    summary = get_class_feature_summary(train_set, ignore_missing_value)
    predicts = test_set.apply(lambda x: predict(summary, x[:8]), axis=1)
    accuracy = calculate_accuracy(test_set.Class.tolist(), predicts.tolist())
    return accuracy

def get_avg_accuracy(iteration, ignore_missing_value):
    avg_accuracy = 0
    for i in range(iteration):
        print(f'Itr {i + 1}')
        accuracy = get_accuracy_for_one_iteration(ignore_missing_value)
        print(f"accuracy: {accuracy}")
        avg_accuracy = (avg_accuracy * i + accuracy)/(i+1)
        print(f"avg_accuracy: {avg_accuracy}")
        print("\n")
    return avg_accuracy


## Run 10 times and calculate average accuracy (with missing values)

In [20]:
avg_accuracy1a = get_avg_accuracy(10, ignore_missing_value=False)

Itr 1
-29.13926831157384
-29.978508042087775
-34.3866992372799
-31.077650377391713
-27.906971146140872
-28.449171286084752
-39.25032240013563
-33.463036794939285
-29.739134118938043
-31.249099189565428
-29.38809274839859
-32.00414034228044
-28.531904426950838
-28.656269515032115
-36.24126086759398
-33.81107558057339
-27.11805085885706
-28.068835318530667
-28.34628834449835
-29.461157299009386
-26.752404378777165
-29.152376455350886
-33.76698621939018
-32.413054770530486
-27.247085140066364
-31.383853552913607
-31.20769447228655
-30.078079736965822
-28.46728072539207
-28.301251444934543
-26.250095936566705
-28.578580112121248
-33.813171167463985
-31.934641068541808
-29.010053087042706
-29.77375033296056
-48.06103333132931
-39.737189844327474
-26.762465100618396
-28.72311736576244
-26.636839686209118
-30.67070687439304
-31.261852287738602
-28.384937738524417
-27.207564195889088
-29.062231867243185
-28.575912782632503
-28.166217322512342
-30.718179171590776
-29.016468579823677
-42.9370579

-31.44713753421206
-33.081951354303015
-36.09648530108071
-30.905081550589014
-27.551472155486316
-29.021829122325304
-28.65193451873548
-28.21351734868085
-27.928735853286014
-29.245366083170758
-31.472444878911713
-31.23036619768092
-28.32376626522254
-27.18815265221268
-26.524897429417926
-26.713483214477552
-44.08145503373953
-46.394917475555076
-53.66497677692393
-40.710552122492615
-28.547303474140747
-28.493654923588647
-27.83470879399712
-28.16617784515582
-25.641655510289393
-27.847139466962155
-28.03502082785505
-32.53954012456516
-27.998967580063603
-32.20760467436264
-30.675543973447404
-29.121641496542328
-26.95145940434207
-29.69798822818248
-34.195536687037986
-35.777784874065254
-29.59277221064866
-28.546147093237295
-28.770758615585848
-30.399292217088107
-27.155696022907666
-30.488194415961626
-34.394598569633644
-34.832420495290364
-26.166099808425404
-28.523176869570232
-32.681350413936585
-29.537214262737766
-27.516243799772077
-28.67759160698063
-29.11697861627190

-28.148038121760386
-29.952274236004666
-27.5244192988875
-29.341715622234815
-36.09648530108071
-30.905081550589014
-27.34784376936705
-28.488002499919347
-29.287547886013556
-27.495412625272593
-27.21364479876151
-27.99963398861119
-29.555265058823366
-29.164860589628702
-27.34589485897107
-29.773394710636413
-27.63158336610707
-29.4936635850428
-32.14917035829802
-29.30532484939573
-27.702071014812688
-31.756548775476148
-28.251081837958274
-29.84342917963458
-42.93705794448354
-35.89566329440921
-27.2366503558616
-29.212415934533666
-27.31623762049941
-29.136378118037705
-34.3866992372799
-31.077650377391713
-36.873949301973674
-35.89081642489948
-26.29572375267544
-29.629708249278462
-26.461919200890865
-29.94890083657602
-32.4661191721738
-30.16699195738933
-31.59510854829
-31.705803914095934
-28.46728072539207
-28.301251444934543
-29.52630249390585
-29.230274995817588
-37.837783645769775
-35.4683157210756
-31.897709927515734
-30.393449635666155
-27.334230162929046
-31.0433449837

-29.949254065622583
-31.768016565942332
-26.95145940434207
-29.69798822818248
-30.32757330250966
-33.740846561818444
-27.334230162929046
-31.043344983780628
-28.827717612658635
-30.30771339432246
-30.484153607841737
-29.019004200737626
-31.373231111181845
-31.77118440685059
-27.2154830506019
-28.99797465241245
-27.70347368419505
-28.11194230508851
-33.09509122897235
-30.34458457427958
-28.973212558194252
-28.82477896279518
-26.182226718422967
-28.406755127843955
-32.814035161100044
-31.277812499702602
-34.64469423181778
-29.477940555576158
-26.690440283897704
-29.59225337008052
-30.497054614154855
-28.94432850131786
-27.077603439847778
-27.88441884429855
-49.190684777059836
-40.94377086768289
-26.81453715050758
-28.952281863926036
-38.04612466468069
-32.12900140876025
-29.872253437355493
-28.733358202201373
-36.09648530108071
-30.905081550589014
-27.77619096272129
-30.212492585996216
-27.953989064208443
-29.366996195493414
-27.353536508067705
-29.22860322383018
-27.028440097521685
-31.

-31.89200677935941
-30.593821112541193
-27.585754822917394
-30.708906029869432
-27.10904526547508
-30.401861664019073
-37.36399669910481
-31.91044921606198
-28.083766518803223
-30.207699648908513
-26.846213390064715
-30.148904301953944
-27.830974245853945
-28.583858264883858
-35.03669357555914
-37.64585607245257
-27.623095133967542
-28.240411010886575
-26.524897429417926
-26.713483214477552
-26.455239457622184
-29.06605341109579
-27.013392916065836
-29.548863864976607
-26.123277999279455
-29.281933990361328
-36.24126086759398
-33.81107558057339
-51.28542925624438
-39.539662438947616
-27.00827068837633
-28.672733732186142
-27.074822543433665
-28.199885964381803
-27.519012872977857
-28.440319678338607
-30.9605136152219
-28.985389667091876
-28.40240624608698
-28.05971448701057
-27.928735853286014
-29.245366083170758
-28.488921731367576
-32.04221639960346
-26.81453715050758
-28.952281863926036
-26.61217315035223
-30.52648017310757
-31.572191054639006
-28.366552455625463
-38.846241286655676

-26.770525234314647
-28.448689281451145
-40.58463844285917
-42.88995408979177
-27.320361977095303
-30.45864068930688
-26.444388332192705
-28.78373136848244
accuracy: 0.7662337662337663
avg_accuracy: 0.7569573283858997


Itr 8
-29.350835125673903
-28.430196151873034
-27.70347368419505
-28.11194230508851
-25.917665186885962
-28.309671004978743
-27.896680525379498
-29.174237946296987
-28.257647453720125
-28.623375037400784
-27.80089315238896
-28.87448591723655
-32.14917035829802
-29.30532484939573
-26.25739511157721
-27.765047773095553
-28.546813988926377
-31.46897693455899
-26.119335093516447
-28.001723868502367
-29.616209706463593
-29.688363380844457
-42.35961641468742
-34.524080977880644
-36.55794318942395
-32.25044975908503
-26.246173136950908
-29.580966091552515
-27.585754822917394
-30.708906029869432
-28.253003715682336
-28.833111201084854
-26.815713241029677
-28.588239973218588
-27.616945530890472
-29.26865972010935
-28.605255090537096
-33.006061950441456
-33.245938653352944
-28.83

-32.03425016240812
-29.040818532194923
-29.120217746907738
-29.66813765817172
-28.813131137414906
-28.026945071867537
-27.207564195889088
-29.062231867243185
-38.04612466468069
-32.12900140876025
-32.19625034375612
-30.266349926866134
-27.142853023661623
-29.376931481327222
-27.396423820132334
-30.086177602645137
-29.681333273745402
-29.899834556598023
-28.489039309749593
-28.643025877177948
-33.76698621939018
-32.413054770530486
-26.429294262149114
-29.259067633619406
-27.813987770044832
-29.838358675326376
-27.5031050656011
-28.864077462513873
-37.63684705128307
-33.55703232064861
-42.74324899895781
-48.17231245138006
-27.464968630754672
-27.617708298659924
-26.977248645714877
-29.33223504140003
-29.550683978805967
-29.601406354616728
-28.606298976945354
-33.204433541477734
-29.616209706463593
-29.688363380844457
-32.09420204388438
-32.26965043964279
-26.29035038439662
-29.05880777784424
-32.330999264858505
-32.14952400545061
-27.460026888603196
-28.106195208460118
-28.96796755948767

-30.569662412055948
-28.085229896631574
-31.63251962625541
-29.505309938613916
-26.938363423984505
-28.23402983697381
-39.937949617076875
-33.40060788841923
-36.8625997950859
-35.346235232436264
-30.409036601366168
-30.24423569787088
-30.415961306078433
-31.26444797101165
-27.049636687426915
-27.577261970331197
-27.55479232403418
-28.708811257483518
-27.97873914145695
-29.03990622902522
-27.64247914015705
-30.694323334510695
-26.85903753360825
-29.303207568326865
-27.28715886351959
-28.34053787886988
-45.07629537944486
-35.65554915649286
-27.892961273503317
-32.02832719975492
-29.2534940242985
-34.0741121575885
-26.677504029232487
-30.3232062082689
-28.56623672594096
-30.154572211196424
-27.730722116523815
-29.453359377608734
-29.555265058823366
-29.164860589628702
-26.972827492580993
-28.364480688440448
-26.771593294538437
-30.428658226975706
-28.320055557149136
-28.79086224449203
-32.542876479354376
-32.23070337930705
-31.527593873686442
-29.874249845067045
-29.167161568324403
-29.82

## Run 10 times and calculate average accuracy (without missing values)

In [14]:
avg_accuracy1b = get_avg_accuracy(10, ignore_missing_value=True)

Itr 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


accuracy: 0.7012987012987013
avg_accuracy: 0.7012987012987013


Itr 2
accuracy: 0.7597402597402597
avg_accuracy: 0.7305194805194806


Itr 3
accuracy: 0.7337662337662337
avg_accuracy: 0.7316017316017316


Itr 4
accuracy: 0.7142857142857143
avg_accuracy: 0.7272727272727273


Itr 5
accuracy: 0.7142857142857143
avg_accuracy: 0.7246753246753247


Itr 6
accuracy: 0.7597402597402597
avg_accuracy: 0.7305194805194805


Itr 7
accuracy: 0.7792207792207793
avg_accuracy: 0.7374768089053803


Itr 8
accuracy: 0.7402597402597403
avg_accuracy: 0.7378246753246753


Itr 9
accuracy: 0.7077922077922078
avg_accuracy: 0.7344877344877345


Itr 10
accuracy: 0.7597402597402597
avg_accuracy: 0.737012987012987


