In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# in the same order os.walk yields them.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later cells
# (e.g. during model fitting or grid search) are hidden as well — consider narrowing
# it to specific categories if results look suspicious.
warnings.filterwarnings("ignore")

Reading Data

In [None]:
# Load the diabetes CSV from the read-only Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

# Understanding the data

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(
    corrmat,
    vmax=0.8,      # cap the colour scale so strong correlations do not wash out the rest
    square=True,
    annot=True,
    ax=ax,         # draw on the axes created above
);

In [None]:
# Number of distinct (non-null) values per column.
# The mapping is deliberately NOT named `dict`: that would shadow the builtin for
# every later cell in the kernel session.
unique_counts = {col: df[col].value_counts().shape[0] for col in df.columns}

pd.DataFrame(unique_counts, index=["unique count"]).transpose()

Separating into features and targets

In [None]:
# Every column except the label is treated as a feature.
target = ['Outcome']
con_cols = [col for col in df.columns if col not in target]
print("Feature columns: ", con_cols)
print("Target columns: ", target)
df.head()

Checking for Missing Values

In [None]:
# Count of null entries per column (isna is the canonical name for isnull).
df.isna().sum()

Box Plot

In [None]:
# One box plot per column, stacked vertically.
# The number of subplots follows the frame instead of a hard-coded 9, so the cell
# keeps working if columns are added or removed; height stays at 5 inches per plot
# (45 inches for the original 9 columns).
plot_cols = list(df.columns)
fig, axs = plt.subplots(len(plot_cols), squeeze=False, figsize=(8, 5 * len(plot_cols)))
axs = axs.ravel()  # squeeze=False returns a 2-D array; flatten to one axes per column
for ax, col in zip(axs, plot_cols):
    sns.boxplot(y=df[col], ax=ax)

Histogram

In [None]:
# One histogram (with KDE overlay) per column, stacked vertically.
# The number of subplots follows the frame instead of a hard-coded 9, so the cell
# keeps working if columns are added or removed; height stays at 5 inches per plot
# (45 inches for the original 9 columns).
plot_cols = list(df.columns)
fig, axs = plt.subplots(len(plot_cols), squeeze=False, figsize=(8, 5 * len(plot_cols)))
axs = axs.ravel()  # squeeze=False returns a 2-D array; flatten to one axes per column
for ax, col in zip(axs, plot_cols):
    sns.histplot(x=df[col], ax=ax, kde=True)

Skewness of each column (computed along the index axis)

In [None]:
# Per-column skewness; axis=0 and skipna=True are the pandas defaults.
df.skew()

# Data Preprocessing

Removing the outliers

In [None]:
# A row is dropped if ANY of these per-column thresholds is violated.
# All conditions are row-wise, so one combined mask removes exactly the rows that
# the sequence of individual drops would remove.
outlier_mask = (
    (df["Pregnancies"] > 14)
    | (df["Glucose"] < 50)
    | (df["BloodPressure"] > 120)
    | (df["SkinThickness"] > 80)
    | (df["Insulin"] > 600)
    | (df["BMI"] > 55)
    | (df["DiabetesPedigreeFunction"] > 2)
    | (df["Age"] > 70)
)
# Still an in-place drop, so `df` remains the same object for later cells.
df.drop(df[outlier_mask].index, inplace=True)

print("Shape of dataset: ", df.shape)

Removing the skewness

In [None]:
# for Insulin
fig, axs = plt.subplots(2)
before_ax, after_ax = axs[0], axs[1]

# Distribution before the transform
sns.kdeplot(df['Insulin'], color='Purple', fill=True, ax=before_ax)

# Log-transform strictly positive values; everything else is mapped to 0.
# NOTE: this overwrites the column, so re-running the cell applies the log again.
insulin_logged = df['Insulin'].map(lambda value: np.log(value) if value > 0 else 0)
df['Insulin'] = insulin_logged

# Distribution and skewness after the transform
sns.kdeplot(df['Insulin'], color='Orange', fill=True, ax=after_ax)
df['Insulin'].skew(axis=0, skipna=True)

In [None]:
# for DiabetesPedigreeFunction
fig, axs = plt.subplots(2)
before_ax, after_ax = axs[0], axs[1]

# Distribution before the transform
sns.kdeplot(df['DiabetesPedigreeFunction'], color='Purple', fill=True, ax=before_ax)

# Log-transform strictly positive values; everything else is mapped to 0.
# NOTE: this overwrites the column, so re-running the cell applies the log again.
dpf_logged = df['DiabetesPedigreeFunction'].map(lambda value: np.log(value) if value > 0 else 0)
df['DiabetesPedigreeFunction'] = dpf_logged

# Distribution and skewness after the transform
sns.kdeplot(df['DiabetesPedigreeFunction'], color='Orange', fill=True, ax=after_ax)
df['DiabetesPedigreeFunction'].skew(axis=0, skipna=True)

In [None]:
# for Age
fig, axs = plt.subplots(2)
before_ax, after_ax = axs[0], axs[1]

# Distribution before the transform
sns.kdeplot(df['Age'], color='Purple', fill=True, ax=before_ax)

# Log-transform strictly positive values; everything else is mapped to 0.
# NOTE: this overwrites the column, so re-running the cell applies the log again.
age_logged = df['Age'].map(lambda value: np.log(value) if value > 0 else 0)
df['Age'] = age_logged

# Distribution and skewness after the transform
sns.kdeplot(df['Age'], color='Orange', fill=True, ax=after_ax)
df['Age'].skew(axis=0, skipna=True)

Making features model ready

In [None]:
# Work on a real copy of the dataframe. A plain `df1 = df` would only bind a second
# name to the same object, so any later in-place change to df1 would also alter df.
df1 = df.copy()

# separating the features and target
X = df1.drop(['Outcome'], axis=1)
y = df1[['Outcome']]  # kept as a single-column DataFrame (2-D), not a Series
feature_cols = list(X.columns)


Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the scaling statistics from the training split only, then apply that same
# fitted scaler to both splits so no information from the test set is used.
sc.fit(X_train[feature_cols])
X_train[feature_cols] = sc.transform(X_train[feature_cols])
X_test[feature_cols] = sc.transform(X_test[feature_cols])

In [None]:
# Preview the scaled training features. `head` must be called: a bare `X_train.head`
# only displays the bound method's repr instead of the first rows.
X_train.head()

# Modeling

Packages

In [None]:
# Base Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Ensembling and Boosting
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Metrics
from sklearn.metrics import accuracy_score,classification_report

# Hyper-parameter tuning
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Base Modeling

In [None]:
# Baseline: every classifier with its default hyper-parameters.
models = [
    ('SVC', SVC()),
    ('DecisionTreeClassifier',DecisionTreeClassifier()),
    ('KNeighborsClassifier',KNeighborsClassifier()),
    ('LogisticRegression',LogisticRegression()),
    ('RandomForestClassifier',RandomForestClassifier()),
    ('GradientBoostingClassifier',GradientBoostingClassifier())
]

# scikit-learn classifiers expect a 1-D target. y_train comes from a single-column
# DataFrame, so flatten it once here instead of relying on an implicit conversion
# (which raises a DataConversionWarning on every fit).
y_train_1d = np.ravel(y_train)

print("The accuracy scores of the models are :")
for model_name, model in models:
    model.fit(X_train, y_train_1d)
    y_pred = model.predict(X_test)
    print(model_name, ": ", accuracy_score(y_test,y_pred))

# Hyperparameter tuning using GridSearchCV

Defining Function for Hyperparameter tuning using Grid Search CV

In [None]:
def AccuracyHyperParameterTune(classifier, param_grid, X_train, y_train, X_test, y_test, cv=5, n_jobs=1):
    """Tune ``classifier`` with an exhaustive, accuracy-scored grid search.

    Prints the best cross-validated accuracy, the winning parameter set and a
    classification report computed on the held-out test data.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    X_train, y_train
        Training features and target used for the cross-validated search.
        ``y_train`` may be 1-D or a single-column 2-D structure; it is flattened.
    X_test, y_test
        Held-out features and target used only for the final report.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 1
        Number of parallel jobs used by the search.

    Returns
    -------
    estimator
        ``grid.best_estimator_`` — the estimator with the best parameter set.
    """
    # initialize grid search
    grid = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        scoring="accuracy",
        verbose=1,
        n_jobs=n_jobs,
        cv=cv,
    )

    # Flatten the target: estimators expect shape (n_samples,), while callers in
    # this notebook pass a single-column frame.
    grid.fit(X_train, np.ravel(y_train))

    # Label the score with the model actually being tuned; a fixed
    # "Support Vector Classifier" label would be wrong for every other estimator.
    print(type(classifier).__name__ + ": ", grid.best_score_)
    print("Best parameters set:")
    print(grid.best_params_)

    # Evaluate the best estimator on the held-out test set
    y_pred = grid.best_estimator_.predict(X_test)
    print("Classification Report")
    print(classification_report(y_test, y_pred))
    return grid.best_estimator_

Decision Tree Classifier tuning

In [None]:
# define the model
# define the model
classifier = DecisionTreeClassifier()

# define a grid of parameters
# 'auto' is intentionally absent from max_features: for classifiers it was only an
# alias of 'sqrt', and recent scikit-learn releases reject it, which makes every
# candidate using it fail to fit. None (use all features) takes its place.
param_grid = {'criterion':['gini','entropy'],
              'splitter':['best','random'],
              'max_depth':[2,3,4,5,6,7,8],
              'max_features':['sqrt','log2',None],
             }

gridBestEstimator = AccuracyHyperParameterTune(classifier,param_grid,X_train, y_train, X_test, y_test)
print(gridBestEstimator)

K Neighbors Classifier tuning

In [None]:
# define the model
# Model to tune
classifier = KNeighborsClassifier()

# Search space: neighbourhood size, vote weighting, neighbour-search backend and
# the leaf size used by the tree-based backends.
param_grid = {
    'n_neighbors': list(range(2, 9)),      # 2 .. 8
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': list(range(26, 32)),      # 26 .. 31
}

gridBestEstimator = AccuracyHyperParameterTune(
    classifier, param_grid, X_train, y_train, X_test, y_test
)
print(gridBestEstimator)

Support Vector Classifier (SVC) tuning

In [None]:
# define the model
# define the model
classifier = SVC()

# define a grid of parameters
# gamma only affects the rbf kernel; the linear kernel ignores it. Crossing gamma
# with 'linear' would refit the same linear model once per gamma value, so each
# kernel gets its own sub-grid (GridSearchCV accepts a list of grids).
c_values = [0.1, 1, 10, 100, 1000]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
]

gridBestEstimator = AccuracyHyperParameterTune(classifier,param_grid,X_train, y_train, X_test, y_test)
print(gridBestEstimator)

LogisticRegression Classifier tuning

In [None]:
# define the model
# define the model
# The default 'lbfgs' solver does not support the l1 penalty, so every l1 candidate
# in the grid below would fail to fit. 'liblinear' supports both l1 and l2.
classifier = LogisticRegression(solver='liblinear')

# define a grid of parameters
param_grid = {'C': np.logspace(-4, 4, 50),
              'penalty': ['l1', 'l2']
             }

gridBestEstimator = AccuracyHyperParameterTune(classifier,param_grid,X_train, y_train, X_test, y_test)
print(gridBestEstimator)

RandomForestClassifier tuning

In [None]:
# define the model
# Model to tune
classifier = RandomForestClassifier()

# Search space: 2 * 3 * 3 * 3 * 3 = 162 parameter combinations, each of which is
# cross-validated by the tuning helper — expect this cell to be slow.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 40, None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 600, 1400],
}

gridBestEstimator = AccuracyHyperParameterTune(
    classifier, param_grid, X_train, y_train, X_test, y_test
)
print(gridBestEstimator)