# Pima Indian Diabetes Prediction Example


## More about the Dataset

### Importing data

In [4]:
import pandas as pd
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [5]:
print len(df.index) #number of rows in dataset

768


### Reviewing the data

In [16]:
df.isnull() #checking if any null values are present

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False


In [15]:
df.isnull().values.any()

False

In [7]:
df.corr()  
# checking for correlation between columns. Correlated columns add nothing new to the model and 
#increase training time and may make the model biased for one feature/atttribute

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [30]:
# Finding percentage of people with and without diabetes to ensure we have enough samples.
# Rare events are difficult to predict
people_with_diabetes  = float(len(df.loc[df['Outcome']==1]))
people_without_diabetes = float(len(df.loc[df['Outcome']==0]))
print people_with_diabetes*100/len(df.index)
print people_without_diabetes*100/len(df.index)

34.8958333333
65.1041666667


### Splitting data into train and test set 

In [51]:
# 70% training data and #30% test data
from sklearn.cross_validation import train_test_split
test_size =0.3
x = df[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']].values # features for predictions
y = df['Outcome'].values  # outcome for prediction ; whether person is diabetic or not


In [54]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size= test_size, random_state=42)
print x_train.shape,x_test.shape,y_train.shape,y_test.shape

(537, 8) (231, 8) (537,) (231,)


### Reviewing and cleaning data

#### We have already reviewed data for null values, correlation and presence of rare data using functions of Pandas.  Here we use scikit library preprocessing functions to review and perform further enhancements

In [56]:
# Replacing 0's with mean of the columns. 0's are mostly used in numeric data when data is not available. It is a 
# substritution fo null

from sklearn.preprocessing import Imputer 
fill_0 = Imputer(missing_values=0,strategy='mean',axis=0) #axis =0  means mean of columns and not rows
x_train = fill_0.fit_transform(x_train)
x_test = fill_0.fit_transform(x_test) #mean is different for both cases

### Select algorithm and train model
#### Algorithm :- Naive Bayes

In [59]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)

GaussianNB(priors=None)

### Performance Evaluation

In [63]:
from sklearn import metrics
# on training data
predictions_on_train = nb_model.predict(x_train)
accuracy_on_train = metrics.accuracy_score(y_train,predictions_on_train)
print accuracy_on_train*100

75.41899441340783


In [64]:
# on testing data
predictions_on_test =  nb_model.predict(x_test)
accuracy_on_test = metrics.accuracy_score(y_test,predictions_on_test)
print accuracy_on_test

0.7359307359307359


In [66]:
# confusion matrix -- on test data
matrix = metrics.confusion_matrix(y_test,predictions_on_test)
print matrix

[[118  33]
 [ 28  52]]


In [67]:
# classification report 
report = metrics.classification_report(y_test,predictions_on_test)
print report

             precision    recall  f1-score   support

          0       0.81      0.78      0.79       151
          1       0.61      0.65      0.63        80

avg / total       0.74      0.74      0.74       231



#### Algorithm :- Logistic Regression