Import libraries needed in this notebook

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

Load the dataset

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame.
data_diabetes = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
# Preview the first rows as the cell output.
data_diabetes.head()

Overview of attributes

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded data.
data_diabetes.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data_diabetes.describe()

Notice that the attributes Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a minimum value of 0. A value of 0 is not physiologically valid for any of these measurements, so these zeros are treated as missing values.

Replacing 0 values with NaN

In [None]:
# Columns in which a literal 0 is treated as a missing measurement.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Swap those zeros for NaN so later statistics and imputation ignore them.
data_diabetes[zero_as_missing_cols] = data_diabetes[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Re-check the non-null counts per column.
data_diabetes.info()

In [None]:
# Summary statistics again; describe() excludes NaN entries from each column's statistics.
data_diabetes.describe()

Histograms of all the continuous attributes (minus the target variable Outcome)

In [None]:
# Histogram of every feature column. The binary target 'Outcome' is dropped
# first so the grid shows only the attributes, as the section heading states;
# DataFrame.hist would otherwise plot every numeric column, target included.
plot = data_diabetes.drop(columns='Outcome').hist(figsize=(20,20))

We can now replace the np.nan values with the median values of the attributes.

In [None]:
# Fill missing values with each column's median.
# Assigning the result back (instead of `df[col].fillna(..., inplace=True)`)
# avoids chained assignment, which is deprecated in recent pandas and does not
# modify the parent DataFrame once Copy-on-Write is enabled.
imputed_cols = ['Glucose', 'Insulin', 'BMI', 'BloodPressure', 'SkinThickness']
# fillna with a Series matches on column name, so each column gets its own median.
data_diabetes[imputed_cols] = data_diabetes[imputed_cols].fillna(data_diabetes[imputed_cols].median())

In [None]:
# Number of remaining missing values in each column.
data_diabetes.isnull().sum()

Zero and null values have now been imputed

In [None]:
# Bar chart of the number of rows in each Outcome class.
plot= sns.countplot(data=data_diabetes, x='Outcome')

Around 270 patients in the dataset were found to have diabetes while about 500 were not, so the two classes are imbalanced (roughly 1:2).

Correlation between different attributes

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr_matrix = data_diabetes.corr()
plt.figure(figsize=(10, 10))
plot = sns.heatmap(data=corr_matrix, annot=True, cmap='YlOrBr')

### Preparing data for models

Scaling of data

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler that standardizes each feature (removes the mean, scales to unit variance) once fitted.
scaler = StandardScaler()

In [None]:
# Names of the predictor columns (every column except the target).
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

In [None]:
# Select the predictor columns and standardize them.
# NOTE(review): the scaler is fitted on the full dataset, so the folds used
# later for cross-validation have already influenced the scaling statistics.
X_data_to_scale = data_diabetes[features]

# fit_transform returns a plain array; wrap it back into a DataFrame that
# carries the original feature names.
X_data_scaled = pd.DataFrame(scaler.fit_transform(X_data_to_scale), columns=features)

In [None]:
# Preview the first rows of the standardized feature frame.
X_data_scaled.head()

In [None]:
# Target labels for the classifiers: the Outcome column.
y = data_diabetes['Outcome']

### Using a logistic regression model

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline model: logistic regression with default hyper-parameters.
logit_model = LogisticRegression()
# Out-of-fold class predictions from 3-fold cross-validation: every row is
# predicted by a model that was trained without it.
predictions_logit = cross_val_predict(
    estimator=logit_model,
    X=X_data_scaled,
    y=y,
    cv=3,
)

In [None]:
# Fraction of rows whose cross-validated logistic-regression prediction matches the true label.
accuracy_score(y, predictions_logit)

In [None]:
# Precision for the positive class: TP / (TP + FP).
precision_score(y, predictions_logit)

In [None]:
# Recall for the positive class: TP / (TP + FN).
recall_score(y, predictions_logit)

In [None]:
# F1 score: harmonic mean of precision and recall for the positive class.
f1_score(y, predictions_logit)

In [None]:
# Confusion matrix of the cross-validated logistic-regression predictions
# (rows = true class, columns = predicted class).
conf_matrix = confusion_matrix(y, predictions_logit)
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' format would
# render three-digit counts in scientific notation (e.g. 4.3e+02).
plot = sns.heatmap(data=conf_matrix, annot=True, fmt='d')

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC has to be computed from continuous scores. Passing hard 0/1 labels
# collapses the curve to a single decision threshold and only yields the
# average of sensitivity and specificity at that threshold.
# Take the out-of-fold probability of the positive class (column 1) instead.
proba_logit = cross_val_predict(logit_model, X_data_scaled, y, cv=3, method='predict_proba')[:, 1]
roc_auc_score(y, proba_logit)

### Using a Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters. The seed is fixed so the forest
# (and every metric derived from it) is reproducible across runs; 42 matches
# the random_state already used for train_test_split in this notebook.
forest_clf = RandomForestClassifier(random_state=42)
# Out-of-fold class predictions from 3-fold cross-validation.
predictions_forest = cross_val_predict(forest_clf, X_data_scaled, y, cv=3)

In [None]:
# Fraction of rows whose cross-validated random-forest prediction matches the true label.
accuracy_score(y, predictions_forest)

In [None]:
# Precision for the positive class: TP / (TP + FP).
precision_score(y, predictions_forest)

In [None]:
# Recall for the positive class: TP / (TP + FN).
recall_score(y, predictions_forest)

In [None]:
# F1 score: harmonic mean of precision and recall for the positive class.
f1_score(y, predictions_forest)

In [None]:
# Confusion matrix of the cross-validated random-forest predictions
# (rows = true class, columns = predicted class).
conf_matrix_forest = confusion_matrix(y, predictions_forest)
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' format would
# render three-digit counts in scientific notation (e.g. 4.3e+02).
plot = sns.heatmap(data=conf_matrix_forest, annot=True, fmt='d')

In [None]:
# ROC AUC has to be computed from continuous scores. Passing hard 0/1 labels
# collapses the curve to a single decision threshold and only yields the
# average of sensitivity and specificity at that threshold.
# Take the out-of-fold probability of the positive class (column 1) instead.
proba_forest = cross_val_predict(forest_clf, X_data_scaled, y, cv=3, method='predict_proba')[:, 1]
roc_auc_score(y, proba_forest)

### Using a k-nearest neighbors classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set. The split is seeded for
# reproducibility and stratified on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_data_scaled,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Accuracy on the training and held-out splits for k = 1 .. 14;
# position j in each list corresponds to k = j + 1.
train_scores, test_scores = [], []

for k in range(1, 15):
    # Note: knn_clf stays bound to the last model (k = 14) after the loop.
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)

    train_scores.append(knn_clf.score(X_train, y_train))
    test_scores.append(knn_clf.score(X_test, y_test))

In [None]:
# Highest training accuracy and the position of its first occurrence
# (list.index returns the first match, i.e. the smallest such k - 1).
max_score = max(train_scores)
max_ind = train_scores.index(max_score)

In [None]:
# max_score comes from train_scores, so label it as the *training* score; the
# message was otherwise indistinguishable from the test-score report.
# max_ind is a 0-based list position, hence the +1 to recover k.
print('Best knn train score is {} when k is {}'.format(max_score, max_ind+1))

In [None]:
# Highest held-out accuracy and the position of its first occurrence
# (list.index returns the first match, i.e. the smallest such k - 1).
max_test_score = max(test_scores)
max_ind_test = test_scores.index(max_test_score)

In [None]:
# max_test_score comes from test_scores, so label it as the *test* score; the
# message was otherwise indistinguishable from the train-score report.
# max_ind_test is a 0-based list position, hence the +1 to recover k.
print('Best knn test score is {} when k is {}'.format(max_test_score, max_ind_test+1))

In [None]:
# Train vs. test accuracy as a function of k (same 1..14 range as the sweep).
k_axis = list(range(1, 15))
plt.figure(figsize=(10, 6))
p = sns.lineplot(x=k_axis, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_axis, y=test_scores, marker='o', label='Test Score')

We get the best results with a knn classifier when the value of k is set to 11

k-fold cross validation on knn

In [None]:
# KNN with the k chosen above (k = 11).
knn_optimal = KNeighborsClassifier(n_neighbors=11)
# Cross-validate the k = 11 model. The previous code passed `knn_clf`, the
# leftover classifier from the k sweep (k = 14), so `knn_optimal` was never
# actually evaluated.
predictions_knn = cross_val_predict(knn_optimal, X_data_scaled, y, cv=3)

In [None]:
# Fraction of rows whose cross-validated KNN prediction matches the true label.
accuracy_score(y, predictions_knn)

In [None]:
# Precision for the positive class: TP / (TP + FP).
precision_score(y, predictions_knn)

In [None]:
# Recall for the positive class: TP / (TP + FN).
recall_score(y, predictions_knn)

In [None]:
# F1 score: harmonic mean of precision and recall for the positive class.
f1_score(y, predictions_knn)

In [None]:
# Confusion matrix of the cross-validated KNN predictions
# (rows = true class, columns = predicted class).
conf_matrix_knn = confusion_matrix(y, predictions_knn)
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' format would
# render three-digit counts in scientific notation (e.g. 4.3e+02).
plot = sns.heatmap(data=conf_matrix_knn, annot=True, fmt='d')

In [None]:
# ROC AUC has to be computed from continuous scores. Passing hard 0/1 labels
# collapses the curve to a single decision threshold and only yields the
# average of sensitivity and specificity at that threshold.
# Take the out-of-fold probability of the positive class (column 1) from the
# k = 11 model instead.
proba_knn = cross_val_predict(knn_optimal, X_data_scaled, y, cv=3, method='predict_proba')[:, 1]
roc_auc_score(y, proba_knn)

### ROC AUC curve comparison of all the models

In [None]:
from sklearn.metrics import roc_curve

# A ROC curve traces TPR against FPR over every decision threshold, so it needs
# continuous scores. Hard 0/1 predictions give one operating point per model
# (a three-point polyline), which is not a real curve. Use the out-of-fold
# probability of the positive class (column 1) from 3-fold cross-validation.
proba_logit = cross_val_predict(logit_model, X_data_scaled, y, cv=3, method='predict_proba')[:, 1]
proba_forest = cross_val_predict(forest_clf, X_data_scaled, y, cv=3, method='predict_proba')[:, 1]
proba_knn = cross_val_predict(knn_optimal, X_data_scaled, y, cv=3, method='predict_proba')[:, 1]

fpr_logit, tpr_logit, thresholds_logit = roc_curve(y, proba_logit)
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y, proba_forest)
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y, proba_knn)

plt.figure(figsize=(10,6))
plt.plot(fpr_logit, tpr_logit, linewidth=2, color='b')
plt.plot(fpr_forest, tpr_forest, linewidth=2, color='g')
plt.plot(fpr_knn, tpr_knn, linewidth=2, color='r')
# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0,1], [0,1], 'k--')
plt.axis([0,1,0,1])
# Legend entries follow the order of the first three plot calls.
plt.legend(['Logistic Regression', 'Random Forest', 'KNN'])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')

### Observations

- Surprisingly, our random forest and knn classifiers didn't perform any better than the baseline logistic regression model.

- The three models have similar ROC AUC and F1 scores under cross-validation.

- KNN has a lower recall score (0.50), meaning that its overall ability to identify diabetic patients is pretty low.

- Performance can be improved marginally by using GridSearchCV to tweak model parameters.

- Also, a Support Vector Classifier can be used and the performance should be similar if not better.