# About

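This is the Pima Indians Diabetes Database. Despite the name, it is not about India: the patients are of Pima Indian (Native American) heritage.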

__Columns__
* Pregnancies - Number of times pregnant
* Glucose - Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure - Diastolic blood pressure (mm Hg)
* SkinThickness - Triceps skin fold thickness (mm)
* Insulin - 2-Hour serum insulin (mu U/ml)
* BMI - Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction - Diabetes pedigree function
* Age - Age (years)
* Outcome - Class variable (0 or 1); 268 of the 768 rows are 1, the others are 0

# Importing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in kaggle_input_files:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

In [None]:
# Load the diabetes CSV from the Kaggle input directory into the `base` DataFrame.
base = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Seed NumPy's global random number generator with a fixed value
# (presumably so later steps that draw from it give repeatable results).
np.random.seed(42)

# Data Exploration

In [None]:
# Preview the first rows of the raw data.
base.head()

Verifying the Outcome class counts

In [None]:
# Bar chart of how many rows fall into each Outcome class.
# countplot does the counting itself. Handing barplot the raw Outcome column
# (one entry per row) together with its value_counts() (one entry per class)
# pairs two series of different lengths, and relies on positional x/y
# arguments that newer seaborn releases no longer accept.
sns.countplot(x="Outcome", data=base);
plt.grid()

Some variables contain 0 values. In columns where 0 is not a plausible measurement, this may indicate missing (NaN) values.

In [None]:
# Summary statistics for every column, transposed so that each column of
# `base` becomes one row of the table.
base.describe().T

A pairplot to see the attributes

In [None]:
# Pairwise plots coloured by Outcome. The [1:-1] slice leaves out the first
# and the last column of `base`.
pairplot_columns = base.columns[1:-1]
sns.pairplot(data=base, vars=pairplot_columns, hue="Outcome")
plt.show()

Replacing 0 with NaN in the columns below. I didn't change the Pregnancies column, because a 0 there can be a real value (for example, it might be a man).

In [None]:
# Columns in which a 0 is treated as a missing measurement.
na_columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
# Swap every 0 in those columns for NaN so pandas handles it as missing data.
zeros_as_nan = base[na_columns].replace(to_replace=0, value=np.nan)
base[na_columns] = zeros_as_nan

The percentage of NaN values

In [None]:
# Share of missing (NaN) entries per column, expressed as a percentage.
base.isna().mean() * 100

Glucose, Insulin and Skin Thickness may have something to do with diabetes, so I will fill their NaN values with the median of the rows that share the same Outcome (one median for Outcome 0, one for Outcome 1).

In [None]:
# Work on a copy so that `base` keeps its NaN markers.
df = base.copy()

# Fill each column's NaNs with the median of the rows sharing the same Outcome.
# A single .loc[mask, column] assignment writes into df itself. The chained
# form df[column][mask] = ... assigns through an intermediate object, which
# raises SettingWithCopyWarning and is not guaranteed to update df.
# NOTE(review): these medians depend on the target (Outcome) and are computed
# on all rows, so the imputed features carry label information into any later
# evaluation on this data.
for column in ["Glucose", "SkinThickness", "Insulin"]:
    for outcome in (0, 1):
        in_class = df["Outcome"] == outcome
        class_median = base.loc[in_class, column].median()
        df.loc[in_class, column] = base.loc[in_class, column].fillna(class_median)

In [None]:
# Fill the remaining gaps with each column's overall median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df.BloodPressure / df.BMI attribute is a chained in-place operation that
# pandas does not guarantee will modify df itself.
df["BloodPressure"] = df["BloodPressure"].fillna(df["BloodPressure"].median())
df["BMI"] = df["BMI"].fillna(df["BMI"].median())

Splitting into X and y variables

In [None]:
# Feature matrix: every column of df apart from the Outcome label.
X = df.drop(columns="Outcome")
X.head()

In [None]:
# Target vector: the Outcome column of df.
y = df["Outcome"]
y.head()

Creating metrics for evaluation

In [None]:
# Scorer objects for model evaluation, each wrapping one sklearn metric.
f1 = metrics.make_scorer(metrics.f1_score)
accuracy = metrics.make_scorer(metrics.accuracy_score)
precision = metrics.make_scorer(metrics.precision_score)
recall = metrics.make_scorer(metrics.recall_score)
# ROC AUC has to be computed from continuous scores. A plain
# make_scorer(roc_auc_score) would feed it the hard 0/1 output of predict(),
# so use the built-in "roc_auc" scorer, which scores the estimator's
# decision_function / predict_proba output instead.
auc = metrics.get_scorer("roc_auc")

In [None]:
# Metrics handed to cross_validate; printResults reads each one back from the
# result under the key "test_<name>".
scoring = dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

In [None]:
def printResults(cv):
    """Print the mean and standard deviation of the cross-validation scores.

    `cv` is a mapping with "test_accuracy", "test_precision", "test_recall"
    and "test_f1" entries; each entry must provide .mean() and .std().
    One line is printed per metric as "<label><mean> (<std>)", with both
    numbers rounded to three decimals.
    """
    # Labels are padded by hand so the numbers line up in one column.
    rows = (
        ("Accuracy  ", "test_accuracy"),
        ("Precision ", "test_precision"),
        ("Recall    ", "test_recall"),
        ("F1        ", "test_f1"),
    )
    for label, key in rows:
        scores = cv[key]
        print(label + "{:.3f} ({:.3f})".format(scores.mean(), scores.std()))

Splitting into train and test

In [None]:
# Hold out 25% of the rows for testing.
# random_state pins the split so it no longer depends on the state of NumPy's
# global generator (and therefore on which cells ran before this one).
# stratify=y keeps the proportion of Outcome classes the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# GradientBoostingClassifier Model

I'll create a simple model 

In [None]:
# Gradient boosting classifier with every hyper-parameter left at its default.
gbc = GradientBoostingClassifier()

Simple Train and Test Split to predict and evaluate

In [None]:
# Train the default model on the training split.
gbc.fit(X_train, y_train)

In [None]:
# Predicted labels of the default model for the held-out test split.
y_pred = gbc.predict(X_test)

## Classification Report

In [None]:
# Text report comparing the true test labels with the predictions.
print(metrics.classification_report(y_test, y_pred))

## Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted test labels, drawn as an annotated heatmap.
cm = metrics.confusion_matrix(y_test, y_pred)
# fmt="d" prints the cell counts as whole numbers. With seaborn's default
# annotation format (".2g"), a count of 100 or more is shown in scientific
# notation.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues");

## Cross Validation

In [None]:
# Cross-validate the default model on the full X, y with cv=5, collecting
# every metric listed in `scoring`.
cv_gbc = cross_validate(gbc, X, y, scoring=scoring, cv=5)

In [None]:
# Print mean (std) of each cross-validated metric for the default model.
printResults(cv_gbc)

# Model Tuning - GradientBoostingClassifier

In [None]:
import sklearn

# scikit-learn 1.1 renamed the "deviance" loss to "log_loss" and a later
# release removed the old spelling, so pick the name the installed version
# accepts instead of hard-coding one that fails on the other.
sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
log_loss_name = "log_loss" if sklearn_major_minor >= (1, 1) else "deviance"

# Search grid: 2 losses x 10 learning rates x 10 ensemble sizes.
params = {
    "loss": [log_loss_name, "exponential"],
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
}

# Exhaustive search over `params` with cv=5, using gbc as the base estimator.
gs = GridSearchCV(estimator=gbc, param_grid=params, cv=5)

In [None]:
# Run the grid search on the full X, y.
# NOTE(review): X still contains the rows of X_test, so the hyper-parameters
# chosen here have seen the data the tuned model is evaluated on further
# down; consider fitting the search on X_train, y_train instead.
gs.fit(X, y)

In [None]:
# Best score found by the grid search.
gs.best_score_

In [None]:
# Parameter combination the grid search selected as best.
gs.best_params_

In [None]:
# Build the tuned model directly from the grid-search winner. Hand-copying
# the values out of gs.best_params_ would silently go stale whenever the
# search is re-run with different data or a different grid.
gbc_best = GradientBoostingClassifier(**gs.best_params_)

In [None]:
# Train the tuned model on the training split.
gbc_best.fit(X_train, y_train)

In [None]:
# Predicted labels of the tuned model for the held-out test split
# (overwrites the y_pred of the default model).
y_pred = gbc_best.predict(X_test)

## Classification Report

In [None]:
# Text report comparing the true test labels with the tuned model's predictions.
print(metrics.classification_report(y_test, y_pred))

## Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted test labels for the tuned model,
# drawn as an annotated heatmap.
cm = metrics.confusion_matrix(y_test, y_pred)
# fmt="d" prints the cell counts as whole numbers. With seaborn's default
# annotation format (".2g"), a count of 100 or more is shown in scientific
# notation.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues");

## Cross Validation

In [None]:
# Cross-validate the tuned model on the full X, y with cv=5, collecting
# every metric listed in `scoring`.
cv_gbc_best = cross_validate(gbc_best, X, y, cv=5, scoring=scoring)

In [None]:
# Print mean (std) of each cross-validated metric for the tuned model.
printResults(cv_gbc_best)