In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Initial overview of the data: column names, non-null counts and dtypes
df.info()

In [None]:
# Initial descriptive statistics, transposed so that each column of df becomes a row
df.describe().T

In [None]:
# Count the null values in each column
df.isnull().sum()

There are no null (NaN) values in any column.

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

There are no duplicated rows.

## Data Preprocessing

In [None]:
# Check the class balance: number of rows for each value of the Outcome column
df['Outcome'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of the number of rows per Outcome class.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='Outcome', data=df)

In [None]:
# Histogram of every column, to inspect the distributions at a glance
df.hist(figsize=(10,10))

Several columns contain 0 values that make no sense for these measurements (for example a glucose level or BMI of 0), so they most likely stand for missing data.

In [None]:
# Count how many rows hold a 0 in each column where 0 is not a plausible measurement
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    print(f'{column}: ', (df[column] == 0).sum())

We can replace the 0 values in those columns with the mean of each column, using imputation.

In [None]:
# Change 0 into NaN (missing) in the columns where 0 is not a valid measurement.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
zero_means_missing = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df[zero_means_missing] = df[zero_means_missing].replace(0, np.nan)

In [None]:
# Re-count the nulls per column now that the 0 placeholders have been replaced
df.isnull().sum()

Now that the invalid values in these columns are marked as missing, we can fill them in by imputation, using the mean of each column computed separately for each Outcome class.

### Checking the mean of each column which has NaN value

In [None]:
# Summary statistics of Glucose for each Outcome class
df.groupby('Outcome')['Glucose'].describe()

In [None]:
# Summary statistics of BloodPressure for each Outcome class
df.groupby('Outcome')['BloodPressure'].describe()

In [None]:
# Summary statistics of SkinThickness for each Outcome class
df.groupby('Outcome')['SkinThickness'].describe()

In [None]:
# Summary statistics of Insulin for each Outcome class
df.groupby('Outcome')['Insulin'].describe()

In [None]:
# Summary statistics of BMI for each Outcome class
df.groupby('Outcome')['BMI'].describe()

### Imputation Function For Each Column

In [None]:
# Imputation function for the Glucose column
def impute_glucose(cols):
    """Return a row's Glucose value, filling a missing one by Outcome class.

    Args:
        cols: Two-element row (a pandas Series as passed by
            ``DataFrame.apply(axis=1)``, or any plain sequence) with the
            Glucose value at position 0 and the Outcome label at position 1.

    Returns:
        The original Glucose value when it is present; otherwise 111 when
        Outcome is 0 and 142 for any other Outcome.
    """
    # .iloc gives explicit positional access on a label-indexed pandas row
    # (plain integer indexing there is deprecated); lists/tuples are indexed
    # directly.
    row = cols.iloc if hasattr(cols, 'iloc') else cols
    glucose, outcome = row[0], row[1]

    if pd.isnull(glucose):
        return 111 if outcome == 0 else 142
    return glucose

In [None]:
# Imputation function for the BloodPressure column
def impute_bloodpreasure(cols):
    """Return a row's BloodPressure, filling a missing value by Outcome class.

    Args:
        cols: Two-element row (a pandas Series as passed by
            ``DataFrame.apply(axis=1)``, or any plain sequence) with the
            BloodPressure value at position 0 and the Outcome label at
            position 1.

    Returns:
        The original BloodPressure when it is present; otherwise 71 when
        Outcome is 0 and 75 for any other Outcome.
    """
    # Read both values from the row itself. .iloc gives positional access on
    # a label-indexed pandas row; lists/tuples are indexed directly.
    row = cols.iloc if hasattr(cols, 'iloc') else cols
    bloodpreasure, outcome = row[0], row[1]

    if pd.isnull(bloodpreasure):
        return 71 if outcome == 0 else 75
    return bloodpreasure

In [None]:
# Imputation function for the SkinThickness column
def impute_skinthickness(cols):
    """Return a row's SkinThickness, filling a missing value by Outcome class.

    Args:
        cols: Two-element row (a pandas Series as passed by
            ``DataFrame.apply(axis=1)``, or any plain sequence) with the
            SkinThickness value at position 0 and the Outcome label at
            position 1.

    Returns:
        The original SkinThickness when it is present; otherwise 27 when
        Outcome is 0 and 33 for any other Outcome.
    """
    # Read both values from the row itself. .iloc gives positional access on
    # a label-indexed pandas row; lists/tuples are indexed directly.
    row = cols.iloc if hasattr(cols, 'iloc') else cols
    skinthickness, outcome = row[0], row[1]

    if pd.isnull(skinthickness):
        return 27 if outcome == 0 else 33
    return skinthickness

In [None]:
# Imputation function for the Insulin column
def impute_insulin(cols):
    """Return a row's Insulin value, filling a missing one by Outcome class.

    Args:
        cols: Two-element row (a pandas Series as passed by
            ``DataFrame.apply(axis=1)``, or any plain sequence) with the
            Insulin value at position 0 and the Outcome label at position 1.

    Returns:
        The original Insulin value when it is present; otherwise 130 when
        Outcome is 0 and 207 for any other Outcome.
    """
    # Read both values from the row itself. .iloc gives positional access on
    # a label-indexed pandas row; lists/tuples are indexed directly.
    row = cols.iloc if hasattr(cols, 'iloc') else cols
    insulin, outcome = row[0], row[1]

    if pd.isnull(insulin):
        return 130 if outcome == 0 else 207
    return insulin

In [None]:
# Imputation function for the BMI column
def impute_bmi(cols):
    """Return a row's BMI, filling a missing value by Outcome class.

    Args:
        cols: Two-element row (a pandas Series as passed by
            ``DataFrame.apply(axis=1)``, or any plain sequence) with the
            BMI value at position 0 and the Outcome label at position 1.

    Returns:
        The original BMI when it is present; otherwise 31 when Outcome is 0
        and 35 for any other Outcome.
    """
    # Read both values from the row itself. .iloc gives positional access on
    # a label-indexed pandas row; lists/tuples are indexed directly.
    row = cols.iloc if hasattr(cols, 'iloc') else cols
    bmi, outcome = row[0], row[1]

    if pd.isnull(bmi):
        return 31 if outcome == 0 else 35
    return bmi

In [None]:
# Applying the imputation: fill each column's NaNs with that column's OWN
# per-Outcome value. Reusing the Glucose values (111/142) for the other
# columns would distort BloodPressure, SkinThickness, Insulin and BMI.
# Tuple layout: (fill when Outcome == 0, fill for any other Outcome); the
# numbers are the constants used by the impute_* functions defined above.
impute_values = {
    'Glucose': (111, 142),
    'BloodPressure': (71, 75),
    'SkinThickness': (27, 33),
    'Insulin': (130, 207),
    'BMI': (31, 35),
}
for column, (fill_outcome_0, fill_otherwise) in impute_values.items():
    # Row-aligned fill value chosen from the row's Outcome.
    fill = pd.Series(
        np.where(df['Outcome'] == 0, fill_outcome_0, fill_otherwise),
        index=df.index,
    )
    # Only NaN entries are replaced; observed measurements stay untouched.
    df[column] = df[column].fillna(fill)

In [None]:
# Count the nulls per column again after imputation
df.isnull().sum()

In [None]:
# Display the imputed DataFrame
df

In [None]:
# Descriptive statistics after imputation, one row per column of df
df.describe().T

### Checking Outliers

In [None]:
# One box per column of df, drawn on an explicit 12x7 figure, to look for outliers
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=df, ax=ax)

The boxplot shows outliers in several columns, so we remove them before modelling.

In [None]:
# Interquartile range of every column; it is the yardstick for the outlier filter
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Drop every row that has at least one value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
has_outlier = ((df < lower_fence) | (df > upper_fence)).any(axis=1)
# Rows with remaining NaNs are dropped too, and the index is renumbered from 0
df_filtered = df.loc[~has_outlier].dropna().reset_index(drop=True)
df_filtered.head()

In [None]:
# Same boxplot on the filtered data, to check the effect of the outlier removal
fig, ax = plt.subplots(figsize=(15, 9))
sns.boxplot(data=df_filtered, ax=ax)

### Data Normalization

In [None]:
# Features: every column except the target; target: the Outcome column
X = df_filtered.drop(columns='Outcome')
y = df_filtered.loc[:, 'Outcome']

#### Using Min Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with its default settings
scaler = MinMaxScaler()

In [None]:
# Learn each feature's minimum and maximum.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# further down, so the test rows influence the scaling — consider fitting on
# the training rows only.
scaler.fit(X)

In [None]:
# Replace X with its scaled version (X is rebound here, so this cell is not idempotent)
X = scaler.transform(X)

In [None]:
# Wrap the scaled values back into a DataFrame that carries the feature names.
X = pd.DataFrame(X)
# Assign the names as a flat Index: wrapping them in a list would create a
# single-level MultiIndex whose labels are tuples instead of plain strings.
# Dropping 'Outcome' by name mirrors how X was built, rather than relying on
# the target being the ninth column.
X.columns = df_filtered.columns.drop('Outcome')
X.head()

## Train and Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## Training Model

### KNN 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Test-set misclassification rate for every k from 1 to 39.
# After the loop, `knn` and `knn_prediction` hold the last model (k=39).
error_rate = []

for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    knn_prediction = knn.predict(X_test)
    error_rate.append((knn_prediction != y_test).mean())

In [None]:
# Error rate as a function of k, using the same k range as the sweep
k_values = range(1, 40)
plt.figure(figsize=(10, 6))
plt.plot(
    k_values, error_rate,
    color='blue', linestyle='dashed',
    marker='o', markerfacecolor='red', markersize=10,
)
plt.title('Error Rate vs. K Value', fontsize=30, color='black', pad=15, loc='center')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
from sklearn.metrics import roc_curve
# Predicted probability of the second class (column 1) for each test row.
# NOTE(review): `knn` is whatever model the k-sweep loop left behind, i.e. the
# one fitted with n_neighbors=39 — confirm this is the model meant to be evaluated.
y_pred_proba = knn.predict_proba(X_test)[:,1]
# False/true positive rates at each probability threshold
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

In [None]:
# ROC curve of the KNN model against the chance diagonal
plt.figure(figsize=(10,6))
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
# Title and legend match the other ROC cells; without legend() the curve's
# label is never shown.
plt.title('ROC Curve - KNN')
plt.legend(loc=4)
plt.show()

In [None]:
# Area under the ROC curve, computed from the KNN class-1 probabilities on the test set
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,y_pred_proba)

In [None]:
# Tune KNN with a 10-fold cross-validated grid search
from sklearn.model_selection import GridSearchCV
# n_neighbors is the hyperparameter searched, over k = 1..39
param_grid = {'n_neighbors': np.arange(1, 40)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)
knn_cv.fit(X, y)

print(f"Best Score:{knn_cv.best_score_!s}")
print(f"Best Parameters: {knn_cv.best_params_!s}")

### Decision Tree

In [None]:
from sklearn import tree
# `min_impurity_split` was removed in scikit-learn 1.0 and now raises a
# TypeError; `min_impurity_decrease` is the replacement named in the
# deprecation notice. The two are not numerically identical: this one requires
# a split to reduce impurity by at least the given amount.
# random_state fixes the tie-breaking so the tree and its scores are repeatable.
dtc = tree.DecisionTreeClassifier(min_impurity_decrease=0.001, random_state=101)
dtc.fit(X_train, y_train)
dtc_prediction = dtc.predict(X_test)

In [None]:
# Import the metrics module
from sklearn import metrics

# Show the confusion matrix and the per-class classification report for the decision tree
print(metrics.confusion_matrix(y_test, dtc_prediction))
print(metrics.classification_report(y_test, dtc_prediction))

In [None]:
# Score the decision tree predictions on the test set
acc_dtc = metrics.accuracy_score(y_test, dtc_prediction)
prec_dtc = metrics.precision_score(y_test, dtc_prediction)
rec_dtc = metrics.recall_score(y_test, dtc_prediction)
f1_dtc = metrics.f1_score(y_test, dtc_prediction)
kappa_dtc = metrics.cohen_kappa_score(y_test, dtc_prediction)

# Print each score next to its label
for label, score in (
    ("Accuracy:", acc_dtc),
    ("Precision:", prec_dtc),
    ("Recall:", rec_dtc),
    ("F1 Score:", f1_dtc),
    ("Cohens Kappa Score:", kappa_dtc),
):
    print(label, score)

In [None]:
# Plotting setup. The warnings filter silences ALL warnings from here on.
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Global figure size and style; both also apply to every later figure
plt.rcParams.update({'figure.figsize': (10, 10)})
plt.style.use('ggplot')

# ROC curve of the decision tree, built from its class-1 probabilities
dtc_prediction_proba = dtc.predict_proba(X_test)[:, 1]
fprdtc, tprdtc, _ = metrics.roc_curve(y_test, dtc_prediction_proba)
aucdtc = metrics.roc_auc_score(y_test, dtc_prediction_proba)
plt.plot(fprdtc, tprdtc, label=f"Decision Tree, auc={aucdtc!s}")
plt.title('ROC Curve - Decision Tree')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc=4)
plt.show()

### Random Forest

In [None]:
# Import the ensemble module
from sklearn import ensemble

# Random forest with 600 trees. random_state seeds the bootstrap sampling and
# feature selection so the fitted model and its reported scores are repeatable.
rdf = ensemble.RandomForestClassifier(n_estimators=600, random_state=101)
rdf.fit(X_train, y_train)

# Predict the classes of the test data
rdf_prediction = rdf.predict(X_test)

In [None]:
# Import the metrics module
from sklearn import metrics

# Show the confusion matrix and the per-class classification report for the random forest
print(metrics.confusion_matrix(y_test, rdf_prediction))
print(metrics.classification_report(y_test, rdf_prediction))

In [None]:
# Score the random forest predictions on the test set
acc_rdf = metrics.accuracy_score(y_test, rdf_prediction)
prec_rdf = metrics.precision_score(y_test, rdf_prediction)
rec_rdf = metrics.recall_score(y_test, rdf_prediction)
f1_rdf = metrics.f1_score(y_test, rdf_prediction)
kappa_rdf = metrics.cohen_kappa_score(y_test, rdf_prediction)

# Print each score next to its label
for label, score in (
    ("Accuracy:", acc_rdf),
    ("Precision:", prec_rdf),
    ("Recall:", rec_rdf),
    ("F1 Score:", f1_rdf),
    ("Cohens Kappa Score:", kappa_rdf),
):
    print(label, score)

In [None]:
# ROC curve of the random forest, built from its class-1 probabilities
rdf_prediction_proba = rdf.predict_proba(X_test)[:, 1]
fprrdf, tprrdf, _ = metrics.roc_curve(y_test, rdf_prediction_proba)
aucrdf = metrics.roc_auc_score(y_test, rdf_prediction_proba)
plt.plot(fprrdf, tprrdf, label=f"Random Forest, auc={aucrdf!s}")
plt.title('ROC Curve - Random Forest')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc=4)
plt.show()

Random Forest is the best of the three models for classifying this dataset, with an area under the ROC curve (AUC) of about 0.95.