# 2. Naive Bayes Algorithm for Classification on the Pima Indians Diabetes Dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



## Load the data from a CSV file

In [2]:
# Read the Pima Indians Diabetes data; the path is relative to the notebook's
# working directory.
pima_df = pd.read_csv("data/diabetes.csv")

In [3]:
# Preview the first few rows to sanity-check the parsed columns.
pima_df.head()

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


In [4]:
# (rows, columns) of the full dataset, target column included.
print(pima_df.shape)

(768, 9)


In [5]:
# Separate the target column from the predictor columns.
outcome = pima_df['outcome']
features = pima_df.drop(labels='outcome', axis='columns')

## Split it into training and test datasets

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(features, outcome, test_size=0.2, random_state=0)

## Summarizing the dataset: feature names and zero-valued (missing) entries

In [7]:
# Feature names are every column of the frame except the target.
col_names = pima_df.columns.tolist()
col_names.remove('outcome')
print(col_names)

['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi', 'diabetespedigreefunction', 'age']


In [8]:
# A zero is treated as "missing" here; count the zero entries per feature
# across the whole frame.
for column in col_names:
    zero_count = int((pima_df[column] == 0).sum())
    print("No. of missing rows in {0}: {1}".format(column, zero_count))

No. of missing rows in pregnancies: 111
No. of missing rows in glucose: 5
No. of missing rows in bloodpressure: 35
No. of missing rows in skinthickness: 227
No. of missing rows in insulin: 374
No. of missing rows in bmi: 11
No. of missing rows in diabetespedigreefunction: 0
No. of missing rows in age: 0


## Imputing missing values with the column mean

In [9]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# `SimpleImputer` (available since 0.20) is its column-wise replacement.
# Use it when present and fall back to the legacy class on old installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    fill_missing = Imputer(missing_values=0, strategy="mean", axis=0)
else:
    fill_missing = SimpleImputer(missing_values=0, strategy="mean")

# NOTE(review): missing_values=0 is applied to every feature column, including
# `pregnancies`, where 0 may be a legitimate value rather than a gap — confirm.

# Learn the per-column means from the training split only, then reuse those
# same means for the test split. Re-fitting on x_test would leak test-set
# statistics into preprocessing and impute the two splits with different values.
x_train = fill_missing.fit_transform(x_train)
x_test = fill_missing.transform(x_test)

## Classify samples from the test dataset

In [10]:
# fit() returns the estimator itself, so construction and training can be
# chained; the trailing bare name keeps the fitted model's repr as cell output.
clf = GaussianNB().fit(x_train, y_train)
clf

GaussianNB(priors=None)

In [11]:
# Predicted class label for every row of the held-out test split.
y_pred = clf.predict(x_test)

In [12]:
# normalize=True yields the fraction (not the count) of correct predictions.
accuracy = accuracy_score(y_test, y_pred, normalize=True)

In [13]:
# Test-set accuracy as a fraction in [0, 1].
print(accuracy)

0.7857142857142857


In [14]:
# Start the results table from the true test labels; the original row index
# is carried over from the Series.
result = y_test.to_frame()
result.head()

Unnamed: 0,outcome
661,True
122,False
113,False
14,True
529,False


In [15]:
# Put the model's predictions next to the true labels; a plain list is
# matched to the rows by position, not by index label.
result = result.assign(pred_outcome=y_pred.tolist())
result.head()

Unnamed: 0,outcome,pred_outcome
661,True,True
122,False,False
113,False,False
14,True,True
529,False,False


In [16]:
# Persist actual vs. predicted labels; to_csv writes the index by default, so
# the original row labels end up in the first CSV column.
result.to_csv('output.csv')