In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

This notebook addresses the classification of diabetes using relevant parameters and a suitable model, walking through the full modelling process: cleaning the data, visualizing the data, checking correlation and multicollinearity, normalization, modelling and testing, and visualizing the model.

# **1. Cleaning the Data**

In [None]:
#Read the Pima Indians diabetes data set from the Kaggle input directory into a DataFrame
df=pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
#Preview the first 5 rows of the data set
df.head(5)

**This data set has 9 columns depicting number of pregnancies, glucose level, blood pressure, skin thickness, insulin level in blood, Body Mass Index (BMI), DiabetesPedigreeFunction, Age and final outcome whether a person has diabetes or not.**

All the columns except Outcome are quantitative, while Outcome is categorical. Hence no encoding of the feature columns is required for this data set.

In [None]:
#Preview the last 5 rows of the data set
df.tail(5)

**This shows the last five rows of the dataset.**

In [None]:
#Check the shape of the data as a (rows, columns) tuple
df.shape

**This dataset has 768 rows and 9 columns.**

In [None]:
#Print a concise summary of the data: column names, non-null counts and dtypes
df.info()

In [None]:
#Statistical summary of the data
df.describe()

**This table shows the count, mean, standard deviation, minimum, maximum and quartile (25%, 50%, 75%) values for each column.**

In [None]:
#Flag, per column, whether it contains any null / missing value
df.isna().any()

**The first and last five rows shown by `head` and `tail` contain some columns with a value of 0, which does not make sense (for example, blood glucose or blood pressure cannot be 0 for a living patient). We need to replace such values.**

In [None]:
#Replace 0 with a null value (NaN) in the columns where a 0 reading is not physiologically
#possible, so that these entries are counted as missing by the checks that follow
zero_as_missing = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
#np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.head()

In [None]:
#Re-check, per column, whether any null / missing values are now present
df.isna().any()

In [None]:
#Count the missing values in each column
df.isna().sum()

**There are 35, 227 and 374 missing values in the columns BloodPressure, SkinThickness and Insulin respectively; see the counts above for the remaining columns, such as Glucose and BMI.**
We will now replace these null values with the median of each column to make the data valid for modelling.

In [None]:
#Replace null values with the median of their own column
cols_to_impute = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
#Assign the result back instead of calling df[col].fillna(..., inplace=True):
#that chained in-place form acts on a temporary under pandas Copy-on-Write and
#leaves df unchanged. Passing the Series of medians fills each column with its own median.
df[cols_to_impute] = df[cols_to_impute].fillna(df[cols_to_impute].median())
df.head()

# **2. Visualizing the data**

**Let's analyse the data set through plots**

In [None]:
#Histogram of every numeric column, drawn on a single 20x20 inch figure
df.hist(figsize=(20,20))

In [None]:
#Checking the skewness of the data
import matplotlib.pyplot as plt
import seaborn as sns

#One distribution panel per feature, laid out row by row on a 4x2 grid
feature_cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness',
                'Insulin','BMI','DiabetesPedigreeFunction','Age']
fig,axes=plt.subplots(4,2, figsize=(12,12))
#sns.distplot is deprecated; a density-scaled histplot with a KDE overlay is the
#replacement seaborn documents for the same kind of figure (requires seaborn >= 0.11).
#axes.flat walks the grid in row-major order, matching the order of feature_cols.
for ax, col in zip(axes.flat, feature_cols):
    sns.histplot(df[col], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
plt.show()


**The plots show that Pregnancies, Insulin, Age and DiabetesPedigreeFunction are right-skewed, while Glucose, BloodPressure, BMI and SkinThickness are approximately normally distributed.**

In [None]:
#One box plot per feature, laid out row by row on a 4x2 grid, to expose outliers
feature_cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness',
                'Insulin','BMI','DiabetesPedigreeFunction','Age']
fig,axes=plt.subplots(4,2, figsize=(16,16))
#Pass the series as x= explicitly: the first positional argument of sns.boxplot
#is interpreted differently across seaborn versions (x in older releases, data in
#newer ones), whereas the keyword always yields the horizontal box plot intended here.
for ax, col in zip(axes.flat, feature_cols):
    sns.boxplot(x=df[col], ax=ax)
plt.show()


*Clearly our dataset has outliers: every column except Glucose contains them. We will keep these outliers and study them later, because they may reflect other medical conditions or patient-specific cases.*

In [None]:
#Let's check out the pairplot for the data, with the points coloured by Outcome
sns.pairplot(df, hue ='Outcome')

**This plot shows that the two outcome classes (the blue and orange points) overlap each other considerably, and that some of the parameters appear to be correlated with one another. Let us compute the correlation between all the variables to ensure there is no multicollinearity when we create the model. We will do this using a correlation matrix.**

# **3. Feature Selection**

In [None]:
#Create the pairwise correlation matrix of all columns and display it
corr=df.corr()
corr

In [None]:
#Heatmap of the correlation matrix, with the coefficient annotated in each cell
sns.heatmap(corr,annot=True)

As we can see from the correlation matrix, there are two pairs of variables that have a relatively high correlation: Age–Pregnancies and BMI–SkinThickness. The cut-off generally used in the literature for a correlation level that signals multicollinearity is 0.7, but we will examine correlation levels above 0.5 in order to eliminate unneeded variables. This reduces the underlying scale of multicollinearity, while also increasing the efficiency or speed of the model and cutting down data acquisition costs in the final product.

* ***Age to Pregnancies*** is going to show high correlation (0.54) since number of pregnancies so far are expected to grow with age of females. But note that this is only the case for females and not for males. So we shall keep both these variables.

* ***BMI to Skin Thickness*** are also highly correlated (0.54) because thickness of skin directly adds to weight of the patient but does not contribute to height of the patient. Since BMI is weight/height^2, we will be *dropping* Skin Thickness variable which has lower correlation with the outcome. ([External information](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3897752/))

With this, our exploratory analysis is over. We shall refresh the dataframe by loading data once again for modelling to avoid any data leakages between test and train datasets.

# **4. Modelling**

In [None]:
#Reload the data from the raw file so that modelling starts from the original values;
#this discards the zero replacement and median imputation applied to df during exploration
df=pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
df

In [None]:
#We split first to avoid any data leakage later
from sklearn.model_selection import train_test_split
#Stratify on Outcome so both splits keep the same class balance; fixed seed for reproducibility
train,test=train_test_split(df,test_size=0.20,random_state=2,stratify=df['Outcome'])
train_X=train.drop(columns=['Outcome'])
test_X=test.drop(columns=['Outcome'])
train_Y=train['Outcome']
test_Y=test['Outcome']

#As explored in section 1 (Cleaning the Data), a 0 in these columns is a missing reading
#rather than a real measurement, so it is masked as NaN and then imputed with the median
zero_as_missing=['Glucose','BloodPressure','Insulin','BMI','SkinThickness']
#np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
train_X[zero_as_missing]=train_X[zero_as_missing].replace(0,np.nan)
test_X[zero_as_missing]=test_X[zero_as_missing].replace(0,np.nan)
#The medians are taken from the training split only, and only after the 0s were masked.
#Taking them from the reloaded df would (a) include the 0 placeholders in the median and
#(b) let test rows influence the imputation value, i.e. the leakage we split to avoid.
train_medians=train_X[zero_as_missing].median()
#Assign the filled frames back rather than using fillna(..., inplace=True) on a selected
#column, which acts on a temporary under pandas Copy-on-Write and would leave NaNs behind
train_X[zero_as_missing]=train_X[zero_as_missing].fillna(train_medians)
test_X[zero_as_missing]=test_X[zero_as_missing].fillna(train_medians)

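We shall fit the scaler on the training data only, and then apply that same fitted transformation to both the training and the test data, so that no test-set statistics leak into the scaling.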

In [None]:
from sklearn import preprocessing
#Standard scaling: learn the scaling statistics from the training split only,
#then apply that one fitted scaler to both splits
scaler = preprocessing.StandardScaler()
scaler.fit(train_X)
normalized_train_X=scaler.transform(train_X)
normalized_test_X=scaler.transform(test_X)

In [None]:
#Wrap the scaled training features in a DataFrame for a readable display
pd.DataFrame(normalized_train_X)

In [None]:
#Wrap the scaled test features in a DataFrame for a readable display
pd.DataFrame(normalized_test_X)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
#Trial fit: a 30-tree random forest with a fixed seed, trained on the scaled
#training features; n_jobs=-1 lets scikit-learn use all available CPU cores
forest_model = RandomForestClassifier(n_estimators=30, random_state=1, n_jobs=-1)
forest_model.fit(normalized_train_X, train_Y)

In [None]:
#Mean accuracy of the trial forest on the held-out (scaled) test split
forest_model.score(normalized_test_X,test_Y)

In [None]:
#d holds the SUM of the depths of all decision trees in the fitted forest
#(the pruning cell that follows divides it by the number of trees)
d = sum(tree.get_depth() for tree in forest_model.estimators_)
#Displayed value: the average tree depth of the ensemble, truncated to an integer
int(d / len(forest_model.estimators_))

In [None]:
#We will prune our random forest by 10% of the existing average depth.
#The average is recomputed from the fitted forest here instead of reusing the running
#total left in d by the previous cell: the old form (d = f(d)) gave a different,
#ever-shrinking value each time this cell was re-run. On a fresh top-to-bottom run
#the result is unchanged: d = 0.9 * (sum of tree depths / number of trees).
d = sum(estimator.get_depth() for estimator in forest_model.estimators_) / len(forest_model.estimators_) * 0.9

Performing 5-fold Cross Validation on training set to calculate the best forest model.