# Predict Whether A Patient Has Diabetes

I will create a model to predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset.

All patients in this dataset are females at least 21 years old of Pima Indian heritage.

In [None]:
import pandas as pd

# Load the diabetes dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

# DataFrame.info() writes its summary to stdout and returns None, so call it
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Summary statistics, shown as the cell's output.
df.describe()

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

# Replace 0 with NaN

Zero values in the following columns will be replaced with NaN, because a measurement of 0 is not possible in reality:

* Glucose
* BloodPressure
* SkinThickness
* Insulin
* BMI

In [None]:
import numpy as np

# Columns in which a value of 0 is treated as a missing measurement.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Assign the result back to the frame instead of calling
# df[col].replace(..., inplace=True): the in-place call operates on a column
# selection, which triggers a chained-assignment FutureWarning on pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled. This form is also
# safe to re-run.
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

# info() prints the non-null counts itself and returns None, so it is not
# wrapped in print().
df.info()

# Drop Correlated Feature Columns

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the absolute pairwise correlations between all columns.
corr_magnitude = df.corr().abs()
sns.heatmap(corr_magnitude)
plt.show()

In [None]:
# Remove the two feature columns this section drops as correlated.
# errors='ignore' keeps the cell re-runnable: the original in-place drop raised
# KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['SkinThickness', 'Insulin'], errors='ignore')

# Re-plot the absolute correlations for the remaining columns.
sns.heatmap(df.corr().abs())
plt.show()

# Balance Dataset To Prevent Model Being Biased Toward Dominant Class

In [None]:
# Count how many rows fall into each Outcome class.
df.loc[:, 'Outcome'].value_counts()

In [None]:
# Balance the dataset by undersampling class 0 to the size of class 1.
df_outcome_1 = df[df['Outcome'] == 1].copy()
n_positive = len(df_outcome_1)

# Sampling is without replacement, so this assumes class 0 has at least
# n_positive rows; sample() raises ValueError otherwise.
df_outcome_0 = df[df['Outcome'] == 0].sample(n_positive, random_state=1)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# in the same order and keeps their original index labels.
df_balanced = pd.concat([df_outcome_0, df_outcome_1])

# Display class value counts
df_balanced['Outcome'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Separate features from the target, then hold out 20% of the balanced data
# as a test set, stratified on the target so both splits keep the class ratio.
X = df_balanced.drop(columns='Outcome')
y = df_balanced['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

# Select Model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Impute missing values, standardise, then fit a logistic regression.
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=1, solver='liblinear')),
]
pipeline = Pipeline(steps)

# Mean ROC AUC over 5-fold cross-validation on the training set.
cv_auc = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f'LogisticRegression AUC: {cv_auc.mean():.3f}')

In [None]:
from sklearn.svm import LinearSVC

# Impute missing values, standardise, then fit a linear SVM.
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('model', LinearSVC(random_state=1, dual=False)),
]
pipeline = Pipeline(steps)

# Mean ROC AUC over 5-fold cross-validation on the training set.
cv_auc = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f'LinearSVC AUC: {cv_auc.mean():.3f}')

In [None]:
from sklearn.svm import SVC

# Impute missing values, standardise, then fit an SVC with its default kernel.
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=1)),
]
pipeline = Pipeline(steps)

# Mean ROC AUC over 5-fold cross-validation on the training set.
cv_auc = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f'SVC AUC: {cv_auc.mean():.3f}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Impute missing values, standardise, then fit a random forest.
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=1)),
]
pipeline = Pipeline(steps)

# Mean ROC AUC over 5-fold cross-validation on the training set.
cv_auc = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f'RandomForestClassifier AUC: {cv_auc.mean():.3f}')

LogisticRegression is selected because it achieved the highest cross-validated AUC score.

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Rebuild the imputer -> scaler -> logistic-regression pipeline for tuning.
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=1, solver='liblinear')),
]
pipeline = Pipeline(steps)

# Grid over the model's penalty type and C value, evaluated with 5-fold
# cross-validation on the training set and scored by ROC AUC.
param_grid = {
    'model__penalty': ['l1', 'l2'],
    'model__C': [0.1, 0.5, 1, 1.5, 2, 2.5],
}
searcher_cv = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc')
searcher_cv.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print(f"Tuned Parameter: {searcher_cv.best_params_}")
print(f'Best score: {searcher_cv.best_score_:.3f}')

# Keep the pipeline fitted with the best parameters for the hold-out evaluation.
best_estimator = searcher_cv.best_estimator_

# Evaluate Model On A Hold-Out Set

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.metrics import roc_auc_score

# Score the hold-out set with the tuned pipeline and derive the ROC curve
# points (false-positive rate, true-positive rate, thresholds) from the scores.
confidence_scores = best_estimator.decision_function(X_test)
roc_points = roc_curve(y_test, confidence_scores)
fpr, tpr, thresholds = roc_points

%matplotlib inline
import matplotlib.pyplot as plt

# Put the hold-out AUC in the figure title.
holdout_auc = roc_auc_score(y_test, confidence_scores)
title = f'ROC Curve (AUC: {holdout_auc:.3f})'

plt.style.use('fivethirtyeight')

# Dashed diagonal as a reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)

plt.title(title)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Predict on the hold-out set and wrap the confusion matrix in a DataFrame
# whose rows and columns are labelled with the two classes, 0 and 1.
y_pred = best_estimator.predict(X_test)
class_labels = [0, 1]
cm_values = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm_values, index=class_labels, columns=class_labels)

import seaborn as sns
plt.style.use('fivethirtyeight')

# Annotated heatmap of the confusion matrix; fmt='g' formats the cell values
# with the general number format.
ax = sns.heatmap(df_cm, annot=True, fmt='g', cmap='Blues')
ax.set(
    title='Confusion matrix',
    xlabel='Predicted label',
    ylabel='Actual label',
)

# Keep the y-axis tick labels horizontal.
ax.tick_params(axis='y', labelrotation=0)
plt.show()

In [None]:
# Build the text classification report for the hold-out predictions and print it.
report_text = classification_report(y_test, y_pred)
print(report_text)