# DIABETES PREDICTION MODELLING

## Importing the Required Libraries

In [None]:
# basic analysis library
import sys
import numpy as np
import pandas as pd

# visual eda library
from pandas_profiling import ProfileReport
import webbrowser as web

# visualization libraries
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

# feature engineering library
from sklearn.preprocessing import StandardScaler

# classification modelling libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# performance measurement library
from sklearn import metrics as m

# enable display of complete array/dataframe/series
np.set_printoptions(threshold = sys.maxsize)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# ignoring warnings
import warnings
warnings.filterwarnings('ignore')

print('All Required Libraries Imported')

## Loading the Data

In [None]:
data = pd.read_csv('/kaggle/input/machine-learning-for-diabetes-with-python/diabetes_data.csv')
print(data.shape)
data.head()

## Data Analysis

We have already seen that the data contains only numerical features. Let us check whether any of them are stored as text.

In [None]:
# classification of features
# Object-dtype columns count as categorical (text); every other column counts
# as numerical. The label column 'Outcome' is kept apart in its own list.
target = ['Outcome']
numerical, categorical = [], []
for column in data.columns:
    if column == 'Outcome':
        continue
    bucket = categorical if data[column].dtype == 'O' else numerical
    bucket.append(column)

for group_name, members in (('numerical', numerical), ('categorical', categorical), ('target', target)):
    print('There are', len(members), group_name, 'variables')

There are no features stored as text, so we can safely proceed. Let us see the summary statistics of the data.

In [None]:
# summary statistics of data
data.describe()

In [None]:
# number and percentage of null values in data
# The boolean null mask is built once and reduced twice: the column sum gives
# the count of nulls, the column mean gives the fraction of nulls (0.0 - 1.0).
null_mask = data.isnull()
sum_null = null_mask.sum()
mean_null = null_mask.mean()
nulls = pd.concat({'count': sum_null, 'percentage': mean_null}, axis = 1)
nulls

From the above table, it is obvious that there are no null values in our data.

In [None]:
# identifying duplicate rows
# duplicated() flags every repeat of an earlier row; summing the flags counts them.
int(data.duplicated().sum())

Nor are there any duplicate rows.

In [None]:
# correlation matrix of the data
figure = plt.figure(figsize = (10, 10))
corr_matrix = data[numerical].corr().round(2)
sns.heatmap(data = corr_matrix, annot = True)

# the less correlation, the better. More correlation means presence of duplication of features

From the above heatmap, it can be seen that there is a fairly high correlation between Pregnancies and Age (a coefficient of about 0.54). Since the data size is small, this is to be expected, and we can ignore it for now.

In [None]:
# distribution of all features
# One density plot per feature, laid out row by row on a 2 x 4 grid.
fig, axes = plt.subplots(ncols = 4, nrows = 2, figsize = (20, 10))

feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for feature, axis in zip(feature_names, axes.flat):
    sns.kdeplot(data[feature], ax = axis)

In [None]:
# pairplot for data
sns.pairplot(data[numerical])
plt.show()

## Initial Model

Since there are no missing values, let's run a sample model to check on the performance of the model on the current data.

In [None]:
# splitting data into train and test datasets
X = data.drop(['Outcome'], axis = 1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

In [None]:
# RandomForestClassifier model
# random_state is pinned (as it is for the train/test split and the tuned models)
# so that the accuracies printed for this baseline are the same on every run.
imodel = RandomForestClassifier(random_state = 0)
imodel.fit(X_train, y_train)

In [None]:
# training and testing accuracy
print('Training Accuracy:', imodel.score(X_train, y_train))
print('Testing Accuracy:', imodel.score(X_test, y_test))

The training accuracy is 100%, which indicates that the model is clearly overfitting. The same would be for other tree based models. So tree based algorithms are not suitable for this type of data.

In [None]:
# LogisticRegression model
imodel2 = LogisticRegression()
imodel2.fit(X_train, y_train)

In [None]:
# training and testing accuracy
print('Training Accuracy:', imodel2.score(X_train, y_train))
print('Testing Accuracy:', imodel2.score(X_test, y_test))

Here the training and testing accuracies are quite low, plus the testing accuracy is greater than the training accuracy. So we need to process and correct the data in order to improve the accuracies.

## Outliers Analysis

In [None]:
# copy of the original data, so that the original data does not get overwritten
data_clean = data.copy()
data_clean.head()

In [None]:
# outlier distribution of all features
# One boxplot per feature, laid out row by row on a 2 x 4 grid.
fig, axes = plt.subplots(ncols = 4, nrows = 2, figsize = (20, 10))

feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for feature, axis in zip(feature_names, axes.flat):
    sns.boxplot(y = data_clean[feature], ax = axis)

### Pregnancies

In [None]:
# boxplot of Pregnancies to check for outliers
sns.boxplot(y = data_clean['Pregnancies'])

In [None]:
# summary statistics of Pregnancies
# describe() is evaluated once; the quartiles are read from that single result.
pregnancies_summary = data_clean['Pregnancies'].describe()
print(pregnancies_summary)
q1_pr, q3_pr = pregnancies_summary['25%'], pregnancies_summary['75%']
iqr_pr = q3_pr - q1_pr
print('Inter Quartile Range ', iqr_pr)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_pr = q1_pr - (1.5 * iqr_pr)
upper_limit_pr = q3_pr + (1.5 * iqr_pr)
print('Lower Limit ', lower_limit_pr)
print('Upper Limit ', upper_limit_pr)

In [None]:
# target level summary statistics
# describe() of Pregnancies for each Outcome class, placed side by side.
preg_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'Pregnancies'].describe()
    for label in (0, 1)
}
preg_stats = pd.concat(preg_by_class, axis = 1)
preg_stats

Pregnancies can be 0, but it cannot be as high as the upper whisker value of 13, so it would be best to replace the outliers with the respective median values.

In [None]:
# replacing outliers with 2nd Quartile value
# For each Outcome class, rows above the upper limit receive that class's median.
for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_median = data_clean.loc[in_class, 'Pregnancies'].describe()['50%']
    above_limit = data_clean['Pregnancies'] > upper_limit_pr
    data_clean.loc[above_limit & in_class, 'Pregnancies'] = class_median

sns.boxplot(y = data_clean['Pregnancies'])

### Glucose

In [None]:
# boxplot of Glucose to check for outliers
sns.boxplot(y = data_clean['Glucose'])

In [None]:
# summary statistics of Glucose
# describe() is evaluated once; the quartiles are read from that single result.
glucose_summary = data_clean['Glucose'].describe()
print(glucose_summary)
q1_gl, q3_gl = glucose_summary['25%'], glucose_summary['75%']
iqr_gl = q3_gl - q1_gl
print('Inter Quartile Range ', iqr_gl)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_gl = q1_gl - (1.5 * iqr_gl)
upper_limit_gl = q3_gl + (1.5 * iqr_gl)
print('Lower Limit ', lower_limit_gl)
print('Upper Limit ', upper_limit_gl)

In [None]:
# target level summary statistics
# describe() of Glucose for each Outcome class, placed side by side.
glu_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'Glucose'].describe()
    for label in (0, 1)
}
glu_stats = pd.concat(glu_by_class, axis = 1)
glu_stats

The Glucose value cannot be 0, and it also cannot be as high as the maximum value (199), so we need to replace 0s with the respective median values and upper (considerable) outliers with the respective 3rd quartile values.

In [None]:
# replacing outliers with quartile values
#
# The cut-offs and the per-class fill values are all derived from a snapshot of
# the column taken before anything is overwritten. Previously they were
# recomputed from the partially edited column on every statement, so the
# thresholds applied to Outcome 1 depended on what had just been written for
# Outcome 0.
# NOTE(review): the cut-offs are the overall 1st and 3rd quartiles, so about half
# of the rows are overwritten, not only the 0s. The fill values are also looked
# up per 'Outcome' on the full dataset, which lets label information into the
# feature. Confirm that both are intended.

# Class quantiles can be fractional; cast first so they are not written into an
# integer column (a no-op when the column is already float).
data_clean['Glucose'] = data_clean['Glucose'].astype(float)
glucose_before = data_clean['Glucose'].copy()
low_cut_gl = glucose_before.quantile(0.25)
high_cut_gl = glucose_before.quantile(0.75)

for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_values = glucose_before[in_class]
    # low values -> class median, high values -> class 3rd quartile
    data_clean.loc[(glucose_before < low_cut_gl) & in_class, 'Glucose'] = class_values.quantile(0.50)
    data_clean.loc[(glucose_before > high_cut_gl) & in_class, 'Glucose'] = class_values.quantile(0.75)

sns.boxplot(y = data_clean['Glucose'])

### Blood Pressure

In [None]:
# boxplot of BloodPressure to check for outliers
sns.boxplot(y = data_clean['BloodPressure'])

In [None]:
# summary statistics of BloodPressure
# describe() is evaluated once; the quartiles are read from that single result.
bp_summary = data_clean['BloodPressure'].describe()
print(bp_summary)
q1_bp, q3_bp = bp_summary['25%'], bp_summary['75%']
iqr_bp = q3_bp - q1_bp
print('Inter Quartile Range ', iqr_bp)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_bp = q1_bp - (1.5 * iqr_bp)
upper_limit_bp = q3_bp + (1.5 * iqr_bp)
print('Lower Whisker ', lower_limit_bp)
print('Upper Whisker ', upper_limit_bp)

In [None]:
# target level summary statistics
# describe() of BloodPressure for each Outcome class, placed side by side.
bp_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'BloodPressure'].describe()
    for label in (0, 1)
}
bp_stats = pd.concat(bp_by_class, axis = 1)
bp_stats

The BloodPressure value cannot be zero, and also it cannot be as high as the maximum values (122 and 114). So we will replace these small values with medians and higher values (>95) with 3rd quartile values.

In [None]:
# replacing outliers with median and 3rd quartile values
#
# The lower cut-off and the per-class fill values are derived from a snapshot of
# the column taken before anything is overwritten. Previously the lower cut-off
# was recomputed after the Outcome 0 rows had been edited, so Outcome 1 was
# filtered against a different threshold.
# NOTE(review): the lower cut-off is the overall 1st quartile, so about a
# quarter of the rows are overwritten, not only the 0s. The fill values are also
# looked up per 'Outcome' on the full dataset, which lets label information into
# the feature. Confirm that both are intended.

# Class quantiles can be fractional; cast first so they are not written into an
# integer column (a no-op when the column is already float).
data_clean['BloodPressure'] = data_clean['BloodPressure'].astype(float)
bp_before = data_clean['BloodPressure'].copy()
low_cut_bp = bp_before.quantile(0.25)
high_cut_bp = 95

for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_values = bp_before[in_class]
    # low values -> class median, values above 95 -> class 3rd quartile
    data_clean.loc[(bp_before < low_cut_bp) & in_class, 'BloodPressure'] = class_values.quantile(0.50)
    data_clean.loc[(bp_before > high_cut_bp) & in_class, 'BloodPressure'] = class_values.quantile(0.75)

sns.boxplot(y = data_clean['BloodPressure'])

### Skin Thickness

In [None]:
# boxplot of SkinThickness to check for outliers
sns.boxplot(y = data_clean['SkinThickness'])

In [None]:
# summary statistics of SkinThickness
# describe() is evaluated once; the quartiles are read from that single result.
skin_summary = data_clean['SkinThickness'].describe()
print(skin_summary)
q1_st, q3_st = skin_summary['25%'], skin_summary['75%']
iqr_st = q3_st - q1_st
print('Inter Quartile Range ', iqr_st)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_st = q1_st - (1.5 * iqr_st)
upper_limit_st = q3_st + (1.5 * iqr_st)
print('Lower Limit ', lower_limit_st)
print('Upper Limit ', upper_limit_st)

In [None]:
# target level summary statistics
# describe() of SkinThickness for each Outcome class, placed side by side.
st_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'SkinThickness'].describe()
    for label in (0, 1)
}
st_stats = pd.concat(st_by_class, axis = 1)
st_stats

The SkinThickness value cannot be 0. So we replace this value with the median, and the outliers with the 3rd Quartile values.

In [None]:
# replacing outliers with quartile value
#
# The cut-offs and the per-class fill values are derived from a snapshot of the
# column taken before anything is overwritten. Previously they were recomputed
# from the partially edited column on every statement, so the thresholds applied
# to Outcome 1 depended on what had just been written for Outcome 0.
# NOTE(review): the cut-offs are the overall median and 3rd quartile, so roughly
# three quarters of the rows are overwritten, not only the 0s. The fill values
# are also looked up per 'Outcome' on the full dataset, which lets label
# information into the feature. Confirm that both are intended.

# Class quantiles can be fractional; cast first so they are not written into an
# integer column (a no-op when the column is already float).
data_clean['SkinThickness'] = data_clean['SkinThickness'].astype(float)
skin_before = data_clean['SkinThickness'].copy()
low_cut_st = skin_before.quantile(0.50)
high_cut_st = skin_before.quantile(0.75)

for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_values = skin_before[in_class]
    # values below the overall median -> class median, high values -> class 3rd quartile
    data_clean.loc[(skin_before < low_cut_st) & in_class, 'SkinThickness'] = class_values.quantile(0.50)
    data_clean.loc[(skin_before > high_cut_st) & in_class, 'SkinThickness'] = class_values.quantile(0.75)

sns.boxplot(y = data_clean['SkinThickness'])

### Insulin

In [None]:
# boxplot of Insulin to check for outliers
sns.boxplot(y = data_clean['Insulin'])

In [None]:
# summary statistics of Insulin
# describe() is evaluated once; the quartiles are read from that single result.
insulin_summary = data_clean['Insulin'].describe()
print(insulin_summary)
q1_in, q3_in = insulin_summary['25%'], insulin_summary['75%']
iqr_in = q3_in - q1_in
print('Inter Quartile Range ', iqr_in)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_in = q1_in - (1.5 * iqr_in)
upper_limit_in = q3_in + (1.5 * iqr_in)
print('Lower Limit ', lower_limit_in)
print('Upper Limit ', upper_limit_in)

In [None]:
# target level summary statistics
# describe() of Insulin for each Outcome class, placed side by side.
in_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'Insulin'].describe()
    for label in (0, 1)
}
in_stats = pd.concat(in_by_class, axis = 1)
in_stats

There are outliers above the upper whisker, but insulin level value cannot be 0. So we need to replace these 0s with the median and the outliers with 3rd quartile values. Notice that the median of Insulin where Outcome = 1 is 0, so we replace this value with the overall median value.

In [None]:
# replacing 0s with 2nd quartile value and outliers with 3rd quartile value
#
# The cut-offs and fill values are derived from a snapshot of the column taken
# before anything is overwritten. Previously the overall median and 3rd quartile
# were recomputed after the Outcome 0 rows had been edited, so both the threshold
# and the "overall median" written into Outcome 1 were already shifted by the
# Outcome 0 replacement.
# NOTE(review): the cut-offs are the overall median and 3rd quartile, so roughly
# three quarters of the rows are overwritten, not only the 0s. The fill values
# are also looked up per 'Outcome' on the full dataset, which lets label
# information into the feature. Confirm that both are intended.

# Quantiles can be fractional; cast first so they are not written into an
# integer column (a no-op when the column is already float).
data_clean['Insulin'] = data_clean['Insulin'].astype(float)
insulin_before = data_clean['Insulin'].copy()
overall_median_in = insulin_before.quantile(0.50)
high_cut_in = insulin_before.quantile(0.75)

for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_values = insulin_before[in_class]
    # Outcome 0 is filled with its own median; Outcome 1 is filled with the
    # overall median instead (the notebook text reports its class median as 0).
    low_fill = class_values.quantile(0.50) if outcome_label == 0 else overall_median_in
    data_clean.loc[(insulin_before < overall_median_in) & in_class, 'Insulin'] = low_fill
    data_clean.loc[(insulin_before > high_cut_in) & in_class, 'Insulin'] = class_values.quantile(0.75)

sns.boxplot(y = data_clean['Insulin'])

### Body Mass Index (BMI)

In [None]:
# boxplot of BMI to check for outliers
sns.boxplot(y = data_clean['BMI'])

In [None]:
# summary statistics of BMI
# describe() is evaluated once; the quartiles are read from that single result.
bmi_summary = data_clean['BMI'].describe()
print(bmi_summary)
q1_bmi, q3_bmi = bmi_summary['25%'], bmi_summary['75%']
iqr_bmi = q3_bmi - q1_bmi
print('Inter Quartile Range ', iqr_bmi)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_bmi = q1_bmi - (1.5 * iqr_bmi)
upper_limit_bmi = q3_bmi + (1.5 * iqr_bmi)
print('Lower Limit ', lower_limit_bmi)
print('Upper Limit ', upper_limit_bmi)

In [None]:
# target level summary statistics
# describe() of BMI for each Outcome class, placed side by side.
bmi_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'BMI'].describe()
    for label in (0, 1)
}
bmi_stats = pd.concat(bmi_by_class, axis = 1)
bmi_stats

BMI value cannot be 0. So we need to replace these outliers with the 1st quartile and 3rd quartile values.

In [None]:
# replacing 0s with 1st quartile value and outliers with 3rd quartile value
#
# The cut-offs and the per-class fill values are derived from a snapshot of the
# column taken before anything is overwritten. Previously they were recomputed
# from the partially edited column on every statement, so the thresholds applied
# to Outcome 1 depended on what had just been written for Outcome 0.
# NOTE(review): the cut-offs are the overall 1st and 3rd quartiles, so about half
# of the rows are overwritten, not only the 0s. The fill values are also looked
# up per 'Outcome' on the full dataset, which lets label information into the
# feature. Confirm that both are intended.

# A no-op when the column is already float; guards against writing a fractional
# quantile into an integer column otherwise.
data_clean['BMI'] = data_clean['BMI'].astype(float)
bmi_before = data_clean['BMI'].copy()
low_cut_bmi = bmi_before.quantile(0.25)
high_cut_bmi = bmi_before.quantile(0.75)

for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_values = bmi_before[in_class]
    # low values -> class 1st quartile, high values -> class 3rd quartile
    data_clean.loc[(bmi_before < low_cut_bmi) & in_class, 'BMI'] = class_values.quantile(0.25)
    data_clean.loc[(bmi_before > high_cut_bmi) & in_class, 'BMI'] = class_values.quantile(0.75)

sns.boxplot(y = data_clean['BMI'])

### Diabetes Pedigree Function

In [None]:
# boxplot of DiabetesPedigreeFunction to check for outliers
sns.boxplot(y = data_clean['DiabetesPedigreeFunction'])

In [None]:
# summary statistics of DiabetesPedigreeFunction
# describe() is evaluated once; the quartiles are read from that single result.
dpf_summary = data_clean['DiabetesPedigreeFunction'].describe()
print(dpf_summary)
q1_dpf, q3_dpf = dpf_summary['25%'], dpf_summary['75%']
iqr_dpf = q3_dpf - q1_dpf
print('Inter Quartile Range ', iqr_dpf)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_dpf = q1_dpf - (1.5 * iqr_dpf)
upper_limit_dpf = q3_dpf + (1.5 * iqr_dpf)
print('Lower Limit ', lower_limit_dpf)
print('Upper Limit ', upper_limit_dpf)

In [None]:
# target level summary statistics
# describe() of DiabetesPedigreeFunction for each Outcome class, placed side by side.
dpf_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'DiabetesPedigreeFunction'].describe()
    for label in (0, 1)
}
dpf_stats = pd.concat(dpf_by_class, axis = 1)
dpf_stats

The lower values are valid, but we still need to replace the outliers. We will replace these outliers with the 3rd quartile values.

In [None]:
# replacing outliers with 3rd quartile value
#
# The cut-off and the per-class fill values are derived from a snapshot of the
# column taken before anything is overwritten. Previously the overall 3rd
# quartile was recomputed after the Outcome 0 rows had been capped, so Outcome 1
# was filtered against a different threshold.
# NOTE(review): the cut-off is the overall 3rd quartile, so about a quarter of
# the rows are capped. The fill values are also looked up per 'Outcome' on the
# full dataset, which lets label information into the feature. Confirm that both
# are intended.

# A no-op when the column is already float; guards against writing a fractional
# quantile into an integer column otherwise.
data_clean['DiabetesPedigreeFunction'] = data_clean['DiabetesPedigreeFunction'].astype(float)
dpf_before = data_clean['DiabetesPedigreeFunction'].copy()
high_cut_dpf = dpf_before.quantile(0.75)

for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    # high values -> class 3rd quartile
    data_clean.loc[(dpf_before > high_cut_dpf) & in_class, 'DiabetesPedigreeFunction'] = dpf_before[in_class].quantile(0.75)

sns.boxplot(y = data_clean['DiabetesPedigreeFunction'])

### Age

In [None]:
# boxplot of Age to check for outliers
sns.boxplot(y = data_clean['Age'])

In [None]:
# summary statistics of Age
# describe() is evaluated once; the quartiles are read from that single result.
age_column = data_clean['Age']
print('Mean\t', age_column.mean())
print('Median\t', age_column.median())
age_summary = age_column.describe()
print(age_summary)
q1_age, q3_age = age_summary['25%'], age_summary['75%']
iqr_age = q3_age - q1_age
print('Inter Quartile Range ', iqr_age)
# outlier fences: 1.5 * IQR below the 1st quartile / above the 3rd quartile
lower_limit_age = q1_age - (1.5 * iqr_age)
upper_limit_age = q3_age + (1.5 * iqr_age)
print('Lower Limit ', lower_limit_age)
print('Upper Limit ', upper_limit_age)

In [None]:
# target level summary statistics
# describe() of Age for each Outcome class, placed side by side.
age_by_class = {
    f'Outcome {label}': data_clean.loc[data_clean['Outcome'] == label, 'Age'].describe()
    for label in (0, 1)
}
age_stats = pd.concat(age_by_class, axis = 1)
age_stats

All the other values look valid, so we need to replace the outliers with the 3rd quartile values. Since the upper whisker value (66) is itself an outlier (according to the boxplot), we can reduce the threshold to 60.

In [None]:
# replacing outliers with 3rd quartile value
# For each Outcome class, ages above 60 receive that class's 3rd quartile.
for outcome_label in (0, 1):
    in_class = data_clean['Outcome'] == outcome_label
    class_q3 = data_clean.loc[in_class, 'Age'].describe()['75%']
    over_sixty = data_clean['Age'] > 60
    data_clean.loc[over_sixty & in_class, 'Age'] = class_q3

sns.boxplot(y = data_clean['Age'])

Now that all the features have been cleaned, let us compare the distributions of the features before and after cleaning.

In [None]:
# distribution of all features before and after cleaning
# Each panel overlays two densities for one feature: the raw data first
# (default colour), then the cleaned data (green).
fig, axes = plt.subplots(ncols = 4, nrows = 2, figsize = (20, 10))

feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for feature, axis in zip(feature_names, axes.flat):
    sns.kdeplot(data[feature], ax = axis)                           # before cleaning
    sns.kdeplot(data_clean[feature], ax = axis, color = 'green')    # after cleaning

In [None]:
# pairplot for cleaned data
sns.pairplot(data_clean[numerical])
plt.show()

The above pairplot shows a neater correlation of all features with each other compared to the earlier one. Now that all variables are corrected, let's standardize the data.

## Standardization

In [None]:
data_scal = data_clean.copy()
data_scal.head()

Take a look at the data above. All the features are at different scales. This would definitely affect the performance of the model. So we need to transform the data so that all features are at a common scale. For this purpose, we use Standardization.

In [None]:
# splitting into train and test datasets
X = data_scal.drop(['Outcome'], axis = 1)
y = data_scal['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

In [None]:
# fitting the scaler on the train data
scaler = StandardScaler()
scaler.fit(X_train[numerical])

In [None]:
# transforming X_train and X_test
X_train[numerical] = scaler.transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])
X_train.head()

As we can see, all the features are now at a common scale. This would definitely help in the model performance. Now that our data is ready to be fed to the model, let us run the model.

## Feeding Processed Data to Model

As discussed in the beginning, tree-based algorithms (like Decision Tree, Random Forest, Gradient Boost etc.) are not suitable for this data as they overfit on this data. So we start with Logistic Regression.

In [None]:
# Logistic Regression model
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

In [None]:
# training and testing accuracy
print('Logistic Training Accuracy:', model_lr.score(X_train, y_train))
print('Logistic Testing Accuracy:', model_lr.score(X_test, y_test))

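The accuracies have improved very much compared to the initial model we ran at the beginning. So it can be said that our data was cleaned and scaled properly and effectively. Note, however, that the outlier replacement above used values computed separately for each Outcome class on the full dataset, before the train/test split, so part of this improvement may come from label information leaking into the features rather than from the cleaning and scaling alone.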

Let us feed the data to few other models as well and check their performance.

In [None]:
# Support Vector model
model_svm = SVC()
model_svm.fit(X_train, y_train)

# training and testing accuracy
print('SVM Training Accuracy:', model_svm.score(X_train, y_train))
print('SVM Testing Accuracy:', model_svm.score(X_test, y_test))

In [None]:
# KNeighborsClassifier model
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

# training and testing accuracy
print('KNN Training Accuracy:', model_knn.score(X_train, y_train))
print('KNN Testing Accuracy:', model_knn.score(X_test, y_test))

In [None]:
# MLPClassifier model
# random_state is pinned (as it is for the tuned MLP further down) so that the
# accuracies printed for this baseline are the same on every run.
model_mlp = MLPClassifier(random_state = 0)
model_mlp.fit(X_train, y_train)

# training and testing accuracy
print('MLP Training Accuracy:', model_mlp.score(X_train, y_train))
print('MLP Testing Accuracy:', model_mlp.score(X_test, y_test))

## Hyperparameter Tuning

Now that we have trained and tested for the accuracies, let's tune the hyperparameters by using the GridSearchCV function.

In [None]:
# best parameter function
def print_results(y_pred, model):
    """Print the best hyperparameters and evaluation metrics of a fitted search.

    Parameters
    ----------
    y_pred : predicted labels, scored against the notebook-level ``y_test``.
    model : fitted search object exposing ``best_params_`` and ``score``.

    The notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are
    read directly. Nothing is returned; all output goes to stdout.
    """
    print(f'\nBest Parameters: {model.best_params_}')
    print('\nPrediction Metrics:\n')

    # accuracy on each split, as reported by the search object itself
    for split_name, features, labels in (('Training', X_train, y_train), ('Testing', X_test, y_test)):
        print(f'{split_name} Accuracy: {model.score(features, labels)}')

    # support-weighted averages over the classes present in the test labels
    weighted_metrics = (('Precision', m.precision_score), ('Recall', m.recall_score), ('F1-Score', m.f1_score))
    for metric_name, metric_fn in weighted_metrics:
        print(f'{metric_name}: {metric_fn(y_test, y_pred, average = "weighted")}')

In [None]:
# Logistic Regression Model
model_lr = LogisticRegression(random_state = 0)
params_lr = {'C':[1,5,10,50], 'solver':['newton-cg','lbfgs','liblinear','sag','saga'], 'max_iter':[50,100,500]}
grid_lr = GridSearchCV(model_lr, params_lr, scoring = 'accuracy', cv = 5, verbose = 5, n_jobs = -1, return_train_score = True)
grid_lr.fit(X_train, y_train)

In [None]:
# best parameters
y_pred_lr = grid_lr.predict(X_test)
print_results(y_pred_lr, grid_lr)

In [None]:
# Support Vector Model
model_svm = SVC(random_state = 0)
params_svm = {'C':[1,5,10,50], 'kernel':['rbf','poly','sigmoid','linear']}
grid_svm = GridSearchCV(model_svm, params_svm, scoring = 'accuracy', cv = 5, verbose = 5, n_jobs = -1, return_train_score = True)
grid_svm.fit(X_train, y_train)

In [None]:
# best parameters
y_pred_svm = grid_svm.predict(X_test)
print_results(y_pred_svm, grid_svm)

In [None]:
# K-Nearest Neighbors Model
model_knn = KNeighborsClassifier()
params_knn = {'n_neighbors':[5,10,20,50]}
grid_knn = GridSearchCV(model_knn, params_knn, scoring = 'accuracy', cv = 5, verbose = 5, n_jobs = -1, return_train_score = True)
grid_knn.fit(X_train, y_train)

In [None]:
# best parameters
y_pred_knn = grid_knn.predict(X_test)
print_results(y_pred_knn, grid_knn)

In [None]:
# MLP Classifier
model_nn = MLPClassifier(random_state = 0)
params_nn = {'solver':['lbfgs','sgd','adam'], 'hidden_layer_sizes':[(50,50,50),(50,100,50),(100,)], 
             'learning_rate':['constant','invscaling','adaptive'], 'activation':['identity','logistic','tanh','relu']}
grid_nn = GridSearchCV(model_nn, params_nn, scoring = 'accuracy', cv = 5, verbose = 5, n_jobs = -1, return_train_score = True)
grid_nn.fit(X_train, y_train)

In [None]:
# best parameters
y_pred_nn = grid_nn.predict(X_test)
print_results(y_pred_nn, grid_nn)

## Results

In [None]:
# performance metrics dataframe
# One row per tuned model: accuracy on both splits plus support-weighted
# precision, recall and F1 on the test split.
# NOTE(review): support-weighted recall is arithmetically equal to plain
# accuracy, so 'Recall' is expected to repeat 'Testing Accuracy' here.
tuned_models = [('LR', grid_lr, y_pred_lr), ('SVM', grid_svm, y_pred_svm),
                ('KNN', grid_knn, y_pred_knn), ('MLP', grid_nn, y_pred_nn)]

metric_rows = []
for model_label, search, predictions in tuned_models:
    metric_rows.append({
        'Model': model_label,
        'Training Accuracy': search.score(X_train, y_train),
        'Testing Accuracy': search.score(X_test, y_test),
        'Precision': m.precision_score(y_test, predictions, average = 'weighted'),
        'Recall': m.recall_score(y_test, predictions, average = 'weighted'),
        'F1-Score': m.f1_score(y_test, predictions, average = 'weighted'),
    })
perf_mets = pd.DataFrame(metric_rows).set_index('Model')

# every column is rendered as a percentage with two decimals
as_percent = '{:,.2%}'.format
perf_mets_perc = perf_mets.style.format({column: as_percent for column in perf_mets.columns})
perf_mets_perc

In [None]:
# comparing the performance metrics
# One bar chart per metric, with the models ordered from best to worst on that
# metric. The sixth panel of the 2 x 3 grid is unused and removed.

fig, axes = plt.subplots(ncols = 3, nrows = 2, figsize = (15, 10))

metric_names = ['Training Accuracy', 'Testing Accuracy', 'Precision', 'Recall', 'F1-Score']
for metric, axis in zip(metric_names, axes.flat):
    ranking = perf_mets.sort_values(metric, ascending = False).index
    sns.barplot(x = perf_mets.index, y = perf_mets[metric], ax = axis, order = ranking)

fig.delaxes(axes[1,2])

# label every bar with its value, just above the top edge
for axis in axes.flat:
    for bar in axis.patches:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        axis.annotate(format(bar_top, '.2%'), (bar_centre, bar_top), ha = 'center', va = 'center', size = 15, xytext = (0, 8), textcoords = 'offset points')

fig.tight_layout()
plt.show()

The above plots show the performance metrics of each model in decreasing order of magnitude. As per the plots, we can see that the MLP Classifier performs better than the other algorithms.

## Summary

In this notebook, 
1. We have analyzed the data, the number of numerical and categorical variables, and the summary statistics of the dataset.
2. We have run an initial model to check the performance of the model on the data.
3. We have analyzed the outliers and handled them effectively with suitable techniques.
4. We have standardized the features so that all features are on a common scale.
5. We have run the models on the corrected and standardized data and have found out its performance metrics.
6. We have tuned the hyperparameters of the models to get the model with best set of hyperparameters.