In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier

from sk_modelcurves.learning_curve import draw_learning_curve

import matplotlib.pyplot as plt
%matplotlib inline

### Describe the content of the dataset and its goals
The dataset is made up of all numerical data of a subset of medical data relating to patient diagnosis for diabetes according to WHO criteria.

The goal of the dataset is to try and determine possible feature importance to forecast diabetes for this specific population, females at least 21 years old of Pima Indian heritage.

There are some missing values for some of the data variables.

In [17]:
col_names = ['times_pregnant','glucose_concentration','blood_pressure','skin_fold_thickness','serum_insulin','bmi','pedigree','age','class']
na={"glucose_concentration" : "0", "blood_pressure" : "0", "skin_fold_thickness" : "0", "serum_insulin" : "0", "bmi" : "0"}
pima_data = pd.read_csv('pima-indians-diabetes/pima-indians-diabetes.data', names=col_names, na_values=na)

### Describe the features and formulate a hypothesis on which might be relevant in predicting diabetes

After doing basic data exploration, looking at the correlation values and the counts it seems to me that glucose_concentration and bmi are the most helpful features in predicting diabetes, at least according to this dataset, on the surface you would think that skin_fold_thickness would be closely related to bmi but we have a lot of null values there and insulin would also be related to glucose_concentration but we have even more null values for that feature

In [5]:
for nextColumn in col_names:
    fromTo = 'either' if nextColumn == 'class' else 'from'
    orTo = 'or' if nextColumn == 'class' else 'to'
    print("- {0}: {1} {2} {3} {4} ".format(nextColumn,fromTo,pima_data[nextColumn].min(),orTo,pima_data[nextColumn].max()))

- times_pregnant: from 0 to 17 
- glucose_concentration: from 44.0 to 199.0 
- blood_pressure: from 24.0 to 122.0 
- skin_fold_thickness: from 7.0 to 99.0 
- serum_insulin: from 14.0 to 846.0 
- bmi: from 18.2 to 67.1 
- pedigree: from 0.078 to 2.42 
- age: from 21 to 81 
- class: either 0 or 1 


In [6]:
pima_data.describe()

Unnamed: 0,times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,pedigree,age,class
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
pima_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
times_pregnant           768 non-null int64
glucose_concentration    763 non-null float64
blood_pressure           733 non-null float64
skin_fold_thickness      541 non-null float64
serum_insulin            394 non-null float64
bmi                      757 non-null float64
pedigree                 768 non-null float64
age                      768 non-null int64
class                    768 non-null int64
dtypes: float64(6), int64(3)
memory usage: 60.0 KB


In [8]:
pima_data.corr().sort('class', ascending=False)

Unnamed: 0,times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,pedigree,age,class
class,0.221898,0.49465,0.170589,0.259491,0.303454,0.31368,0.173844,0.238356,1.0
glucose_concentration,0.128135,1.0,0.223192,0.228043,0.581186,0.232771,0.137246,0.267136,0.49465
bmi,0.021719,0.232771,0.28923,0.648214,0.22805,1.0,0.155382,0.025841,0.31368
serum_insulin,0.082171,0.581186,0.098272,0.184888,1.0,0.22805,0.130395,0.220261,0.303454
skin_fold_thickness,0.100239,0.228043,0.226839,1.0,0.184888,0.648214,0.115016,0.166816,0.259491
age,0.544341,0.267136,0.330107,0.166816,0.220261,0.025841,0.033561,1.0,0.238356
times_pregnant,1.0,0.128135,0.214178,0.100239,0.082171,0.021719,-0.033523,0.544341,0.221898
pedigree,-0.033523,0.137246,-0.002805,0.115016,0.130395,0.155382,1.0,0.033561,0.173844
blood_pressure,0.214178,0.223192,1.0,0.226839,0.098272,0.28923,-0.002805,0.330107,0.170589


### Describe the missing/NULL values. Decide if you should impute or drop them and justify your choice.

As we can see from info() below we have very FEW missing values in glucose_concentration, blood_pressure and bmi, but we have a significant number of values missing for skin_fold_thickness and serum_insulin

    DROP serum_insulin
Most medical literature seems to describe a direct relationship between insulin levels and glucose concentration in the blood in both type I and type II diabetes, therefore I am going to drop serum_insulin since it most likely will not have significant impact as a predictor

https://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?v%3Aproject=medlineplus&v%3Asources=medlineplus-bundle&query=Type+2+diabetes

https://www.nlm.nih.gov/medlineplus/ency/article/000305.htm

    IMPUTE skin fold thickness

After taking a quick look at some literature on BMI/SFT measurements as predictors of diabetes in adults (there is a lot more study being conducted for children and adolescents!) I have a weak idea that the verdict is out on which is a more effective predictor, they are both used, sometimes together, so I am choosing to impute skin fold thickness and fill in the missing data with the mean

In [9]:
pima_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
times_pregnant           768 non-null int64
glucose_concentration    763 non-null float64
blood_pressure           733 non-null float64
skin_fold_thickness      541 non-null float64
serum_insulin            394 non-null float64
bmi                      757 non-null float64
pedigree                 768 non-null float64
age                      768 non-null int64
class                    768 non-null int64
dtypes: float64(6), int64(3)
memory usage: 60.0 KB


In [19]:
pima_data.drop('serum_insulin',1, inplace=True)
pima_data
pima_data.replace({'skin_fold_thickness':{np.nan:pima_data['skin_fold_thickness'].mean()}},inplace=True)

In [21]:
pima_data.replace({'blood_pressure':{np.nan:pima_data['blood_pressure'].mean()}},inplace=True)
pima_data.replace({'bmi':{np.nan:pima_data['bmi'].mean()}},inplace=True)

### Come up with a benchmark for the minimum performance that an algorithm should have on this dataset

In [33]:
X = pima_data['class']
y = pima_data.drop('class', axis=1)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=7)

from sklearn.dummy import DummyClassifier
dc = DummyClassifier()
dc.fit(X_train, y_train)
#dc.score(X_test, y_test) #couldnt get this to work, doing it by hand for now
pima_data['class'].value_counts() / pima_data['class'].shape[0]

0    0.651042
1    0.348958
dtype: float64

    Minimum performance is .65 which is as good as just guessing that everyone has diabetes 

### What's the best performance you can get with kNN? Is kNN a good choice for this dataset?

In [36]:
%%timeit
from sklearn.cross_validation import ShuffleSplit
cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform', p=2)
knn.fit(X_train, y_train)
knn.predict(test)

from sklearn.grid_search import GridSearchCV
parameters = {'n_neighbors': range(1,30), 'weights': ['uniform', 'distance'],'p': range(1,4)}
clf = GridSearchCV(knn, parameters, cv=cv)
clf.fit(X_train, y_train)
clf.best_params_

ValueError: Found arrays with inconsistent numbers of samples: [  1 614]

In [34]:
### What's the best performance you can get with Naive Bayes? Is NB a good choice for this dataset?
### What's the best performance you can get with Logistic Regression? Is LR a good choice for this dataset?
### What's the best performance you can get with Random Forest? Is RF a good choice for this dataset?
### If you could only choose one, which classifer from the above that you already ran is best? How do you define best? (hint: could be prediction accuracy, running time, interpretability, etc)