# Diabetes prediction for the PIMA Indian Diabetes Database

## Context

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

## Data Description

- `Pregnancies` - Number of times pregnant
- `Glucose` - Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- `BloodPressure` - Diastolic blood pressure (mm Hg)
- `SkinThickness` - Triceps skin fold thickness (mm)
- `Insulin` - 2-Hour serum insulin (mu U/ml)
- `BMI` - Body mass index (weight in kg/(height in m)^2)
- `DiabetesPedigreeFunction` - Diabetes pedigree function
- `Age` - Age (years)
- `Outcome` - Class variable (0 or 1); 268 of the 768 records are 1, the others are 0

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the PIMA Indians Diabetes dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
# Preview the first rows to check that the file was parsed as expected.
df.head()

# Basic EDA

In [None]:
# Summarise the column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics, transposed so that each feature is one row.
df.describe().T

# Handling Missing Values

In this dataset, missing values are encoded as 0. First, we replace those zeros with NaN.

In [None]:
# In these columns a 0 stands for a missing measurement rather than a real
# reading, so convert it to NaN and let pandas treat it as missing.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

**Filling the missing values with median values**

In [None]:
def median_target(var):
    """Return the median of column ``var`` for each ``Outcome`` class.

    Rows where ``var`` is missing are excluded before grouping. The result
    is a DataFrame with the columns ``Outcome`` and ``var``, one row per
    outcome value present in the remaining data.
    """
    observed = df.loc[df[var].notnull(), [var, 'Outcome']]
    per_outcome = observed.groupby('Outcome')[[var]].median()
    return per_outcome.reset_index()

The function above computes the median of a feature separately for each outcome class; we then use those medians to fill the missing values of the matching class.

In [None]:
# Every feature column; the target itself is never imputed.
cols = df.columns.drop('Outcome')

for col in cols:
    # Compute the per-class medians once per column and look them up by the
    # outcome label itself rather than by row position in the result.
    medians = median_target(col).set_index('Outcome')[col]

    for outcome, median in medians.items():
        # Fill only the missing entries that belong to this outcome class.
        missing_in_class = (df['Outcome'] == outcome) & df[col].isnull()
        df.loc[missing_in_class, col] = median

In [None]:
# Re-count the missing values per column to confirm the fill left none behind.
df.isnull().sum()

Now as we can see all the missing values have been filled with their median values according to the target.

# Data visualisation

In [None]:
# Pairwise plots across all columns of the frame.
sns.pairplot(df)

In [None]:
# Histogram of every column; the trailing semicolon suppresses the notebook's text output.
df.hist(bins=15,figsize=(20,20));

In [None]:
# Class balance of the target. The column is passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally; a positional Series is
# no longer interpreted as the x variable.
sns.countplot(x='Outcome', data=df)

# Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the 0/1 target is left untouched.
# Selecting by name avoids relying on 'Outcome' being the last column.
feature_cols = df.columns.drop('Outcome')

scaler = StandardScaler()
# A single fit over all features keeps `scaler` reusable for transforming new
# rows later; refitting it per column leaves it fitted on the last column only.
df[feature_cols] = scaler.fit_transform(df[feature_cols])
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features. Consider fitting on
# the training split only.

In [None]:
# Preview the first rows after scaling.
df.head()

In [None]:
# Histogram of every column after scaling; the trailing semicolon suppresses the notebook's text output.
df.hist(bins=15,figsize=(20,20));

# Splitting data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing, stratified on the target so both
# parts keep the same class ratio; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
def plt_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve and the chance diagonal on the current matplotlib axes.

    Parameters
    ----------
    fpr, tpr : array-like
        False and true positive rates, plotted on the x and y axis.
    label : str, optional
        Legend entry for the curve. When omitted, no legend is drawn.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")  # diagonal reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # Only build a legend when this curve has something to show in it;
    # calling legend() with no labelled artist just emits a warning.
    if label is not None:
        plt.legend()
    # Run the layout pass last so the axis labels and legend are accounted for.
    plt.tight_layout()

# Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Seed the estimators so the baseline scores are the same on every run,
# in line with the seeded train/test split.
SEED = 42

# Baseline models, all with default hyper-parameters apart from the seed.
model_rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
model_ext = ExtraTreesClassifier(random_state=SEED)
model_ada = AdaBoostClassifier(random_state=SEED)
model_grad = GradientBoostingClassifier(random_state=SEED)
model_logis = LogisticRegression(random_state=SEED)
model_dec = DecisionTreeClassifier(random_state=SEED)

models = [model_rf, model_ext, model_ada, model_grad, model_logis, model_dec]

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, f1_score

In [None]:
def model_train(model):
    """Fit ``model`` on the training split and report its test-set metrics.

    Prints the model and its classification report, adds the model's ROC
    curve to the current figure, and returns the F1 score on the test split.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # A ROC curve needs continuous scores: hard 0/1 predictions collapse it
    # to a single operating point joined to the corners.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)

    print(f'model: {model}')
    print(classification_report(y_test, y_pred))
    plt_roc_curve(fpr, tpr, label=f'{model}')

    return f1_score(y_test, y_pred)

In [None]:
# Display names, listed in the same order as `models`.
model_names = ['Random Forest',
               'ExtraTress',
               'AdaBoost',
               'Gradient Boosting',
               'Logistic Regression',
               'Decision Tree']

# F1 score of each baseline model: `scores` is keyed by name, `scr` keeps
# the same values as a plain list in model order.
scores = {}
scr = []
for name, model in zip(model_names, models):
    score = model_train(model)
    scores[name] = score
    scr.append(score)

From the results above, we can see that the Gradient Boosting algorithm performs well.

Even without any feature engineering, we get a high ROC score.

# K-Fold cross-validated models with RandomizedSearchCV for best params

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import make_pipeline

In [None]:
def ml_model(model, parameters):
    """Tune ``model`` with a randomized search and report its performance.

    The search samples candidates from ``parameters`` and evaluates them with
    repeated stratified 10-fold cross-validation on the training split. The
    refitted best estimator is then scored on the test split: ROC AUC, train
    and test accuracy and the best parameters are printed, and the ROC curve
    is plotted.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier.
    parameters : dict
        Candidate values per hyper-parameter, passed to RandomizedSearchCV.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
    random_search = RandomizedSearchCV(model, parameters, cv=cv, random_state=1, n_jobs=-1, verbose=2)
    random_search.fit(X_train, y_train)

    best = random_search.best_estimator_
    # Not every estimator exposes probabilities (e.g. SGDClassifier with the
    # perceptron loss); fall back to the decision function for ranking scores.
    if hasattr(best, "predict_proba"):
        y_pred_proba = best.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = best.decision_function(X_test)

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    print("ROC Score : ", roc_auc_score(y_test, y_pred_proba))
    print("Accuracy for train: ", accuracy_score(y_train, best.predict(X_train)))
    print("Accuracy for test: ", accuracy_score(y_test, best.predict(X_test)))
    print("Best params:" + str(random_search.best_params_))
    plt_roc_curve(fpr, tpr)

    return random_search
    
# Candidate hyper-parameter values sampled by the randomized search,
# one dictionary per model family.
log_reg_params = {"C" : [1,2,3,0.01,0.001, 2.5, 1.5],
                  "max_iter" : range(100,800,100)}
knn_params = {"n_neighbors" : np.arange(1,50),
              "leaf_size" : np.arange(1,50)}
decTree_params = {"max_depth" : [5,10,15,20,25,30],
                  "min_samples_split" : np.arange(2,50),
                  "min_samples_leaf" : np.arange(1,50)}
randomForest_params = {"n_estimators" : [100,500, 1000],
                       "min_samples_split" : np.arange(2,30),
                       "min_samples_leaf" : np.arange(1,50),
                       "max_features" : np.arange(1,7)}
grad_params = {"n_estimators" : [100,500,1000],
               "subsample" : [0.6,0.8,1.0],
               "max_depth" : [5,10,15,20,25,30],
               "learning_rate" : [0.1, 0.01, 0.02, 0.5]
               }

# "log_loss" is the current name of the logistic loss: the old "log" spelling
# was deprecated in scikit-learn 1.1 and removed in 1.3.
# NOTE: an SGDClassifier with the "perceptron" loss has no predict_proba.
sgd_params = {"alpha" : [0.0001, 0.1, 0.001, 0.01],
              "max_iter" : [100,500,1000,2000],
              "loss" : ["log_loss","modified_huber","perceptron"]}

In [None]:
# Randomized search over the logistic-regression grid.
ml_model(LogisticRegression(), log_reg_params)

In [None]:
# Randomized search over the k-nearest-neighbours grid.
ml_model(KNeighborsClassifier(), knn_params)

In [None]:
# Randomized search over the decision-tree grid.
ml_model(DecisionTreeClassifier(), decTree_params)

In [None]:
# Randomized search over the random-forest grid.
ml_model(RandomForestClassifier(), randomForest_params)

In [None]:
# Randomized search over the SGD-classifier grid.
ml_model(SGDClassifier(), sgd_params)

In [None]:
# Randomized search over the gradient-boosting grid.
ml_model(GradientBoostingClassifier(), grad_params)

**We got the highest ROC score, 95.4%, with the `GradientBoostingClassifier`, so we rebuild the model with its best parameters and use it for prediction.**

In [None]:
# Hyper-parameters used for the final gradient-boosting model.
final_gradboost_params = {
    'n_estimators': 500,
    'learning_rate': 0.01,
    'max_depth': 20,
    'subsample': 0.6,
}
model_final_gradboost = GradientBoostingClassifier(**final_gradboost_params)

In [None]:
# Train the final model on the training split.
model_final_gradboost.fit(X_train, y_train)

In [None]:
# Positive-class probabilities on the held-out split, used for the ROC metrics.
y_pred_proba = model_final_gradboost.predict_proba(X_test)[:,1]

# Accuracy on both splits.
train_accuracy = accuracy_score(y_train, model_final_gradboost.predict(X_train))
test_accuracy = accuracy_score(y_test, model_final_gradboost.predict(X_test))

print("ROC Score : ",roc_auc_score(y_test, y_pred_proba))
print("Accuracy for train: ", train_accuracy)
print("Accuracy for test: " , test_accuracy)

# ROC curve of the final model.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt_roc_curve(fpr,tpr)

In [None]:
# Class predictions of the final model on the test split.
y_pred = model_final_gradboost.predict(X_test)

In [None]:
# Confusion matrix of the test predictions. fmt='d' prints the counts as plain
# integers; the default '.2g' shows counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='d')

# Storing the Best Model

In [None]:
import pickle

# Serialise the fitted gradient-boosting model to disk. The object dumped must
# be `model_final_gradboost`; no `model_final_random` is defined in this notebook.
with open('model_gradboost.pkl','wb') as file:
    pickle.dump(model_final_gradboost,file)

**We have successfully saved our model to disk.**