## 1. Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas.plotting import scatter_matrix
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 2. Data Exploration

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first rows.
main_df = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)
main_df.head()

In [None]:
# Summary statistics, transposed so each feature is shown as one row.
main_df.describe().transpose()

In [None]:
# A minimum value of zero in these columns indicates a missing measurement.
# Replace the zeros with NaN so the number of missing values can be counted.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a deep copy so the raw frame stays available for comparison.
main_df_copy = main_df.copy(deep=True)
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
main_df_copy[zero_as_missing_cols] = main_df_copy[zero_as_missing_cols].replace(0, np.nan)
print(main_df_copy.isnull().sum())

In [None]:
# Histograms of every column in the raw (uncleaned) data.
# Binding the result to `p` keeps the array-of-axes repr out of the cell output.
p = main_df.hist(figsize=(10, 10))

In [None]:
# Impute the NaN values: mean for Glucose, BloodPressure and BMI;
# median for SkinThickness and Insulin.
fill_values = {
    'Glucose': main_df_copy['Glucose'].mean(),
    'BloodPressure': main_df_copy['BloodPressure'].mean(),
    'SkinThickness': main_df_copy['SkinThickness'].median(),
    'Insulin': main_df_copy['Insulin'].median(),
    'BMI': main_df_copy['BMI'].mean(),
}
# One DataFrame-level fillna with a per-column dict replaces the
# `df[col].fillna(..., inplace=True)` pattern. That pattern is chained
# assignment on a column selection: it warns in pandas 2.x and does not
# update the frame at all once Copy-on-Write is enabled.
main_df_copy = main_df_copy.fillna(value=fill_values)

In [None]:
# Histograms of every column after the missing values were filled in.
# Binding the result to `p` keeps the array-of-axes repr out of the cell output.
p = main_df_copy.hist(figsize=(10, 10))

In [None]:
# Class balance of the target: print the per-class counts, then chart them.
outcome_counts = main_df['Outcome'].value_counts()
print(outcome_counts)
outcome_counts.plot(kind='bar')

In [None]:
# Pairwise scatter plots of the uncleaned data.
# scatter_matrix is reached through the pandas namespace that is already imported.
p = pd.plotting.scatter_matrix(main_df, figsize=(10, 10))

In [None]:
# Pair plot of the cleaned data, coloured by the Outcome column.
p = sns.pairplot(data=main_df_copy, hue='Outcome')

In [None]:
# Annotated heatmap of the pairwise correlations in the cleaned data.
plt.figure(figsize=(12, 10))
correlations = main_df_copy.corr()
p = sns.heatmap(correlations, annot=True)

In [None]:
# Scaling Data
# Standardise every column except the last one (the target) with StandardScaler.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
feature_df = main_df_copy.iloc[:, :-1]
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in the notebook, so held-out rows influence the scaling
# statistics. Consider fitting on the training split only.
# Column labels and index are taken from the source frame instead of a
# hand-typed list, so they cannot drift out of sync with the data.
X = pd.DataFrame(
    sc_X.fit_transform(feature_df),
    columns=feature_df.columns,
    index=feature_df.index,
)
X.head()

In [None]:
# Target vector: the Outcome column of the cleaned frame.
y = main_df_copy['Outcome']

## 3. Train - Test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Accuracy on each split, one entry per k tried (list position 0 holds k=1).
train_score = []
test_score = []

# Try k = 1..14 (the upper bound of range is exclusive).
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    train_score.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

In [None]:
# Best training accuracy and every k that reaches it (k = list position + 1).
max_train_score = max(train_score)
train_score_index = [pos for pos, score in enumerate(train_score) if score == max_train_score]
best_train_ks = [pos + 1 for pos in train_score_index]
print('Max train score: {} for k{}'.format(round(max_train_score*100, 2), best_train_ks))

In [None]:
# Best test accuracy and every k that reaches it (k = list position + 1).
max_test_score = max(test_score)
test_score_index = [index for index, value in enumerate(test_score) if value == max_test_score]
# The label names the metric actually reported here: the test score.
print('Max test score: {} for k{}'.format(round(max_test_score*100, 2), list(map(lambda x: x+1, test_score_index))))

## 4. Result Visualization

In [None]:
# Train vs. test accuracy for each k that was evaluated.
plt.figure(figsize=(10,5))
# Derive the k axis from the recorded scores so it always matches their length.
k_values = list(range(1, len(train_score) + 1))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
# and raises TypeError on positional x/y.
p = sns.lineplot(x=k_values, y=train_score, marker='*', label='Train Score')
p = sns.lineplot(x=k_values, y=test_score, marker='*', label='Test Score')
p.set(xlabel='k (n_neighbors)', ylabel='Accuracy');

In [None]:
# Best results captured at k=5
# Refit a classifier with that k and report its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

knn.score(X_test, y_test)

## 5. Model Performance Analysis.

In [None]:
from sklearn.metrics import confusion_matrix

# Predict on the test split, then tabulate true vs. predicted labels twice:
# as a raw confusion matrix (kept in cnf_matrix) and as a crosstab with
# row/column totals for display.
y_pred = knn.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Draw the confusion matrix as an annotated heatmap.
cnf_frame = pd.DataFrame(cnf_matrix)
p = sns.heatmap(cnf_frame, annot=True)
plt.title('Confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('Actual label');

### Classification Report
Precision, Recall, and F1-Score

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the test labels with the predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

### ROC Curve - AUC

In [None]:
from sklearn.metrics import roc_curve

# Take column 1 of predict_proba (the second class in knn.classes_) as the
# score, then compute the ROC curve points from it.
class_probabilities = knn.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

In [None]:
# ROC curve of the k-NN classifier; the dashed diagonal is a reference line.
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr, tpr, label='knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('knn ROC Curve')
# Without a legend the label passed to plt.plot above is never displayed.
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the same scores used for the curve.
roc_auc_score(y_test, y_pred_prob)

### Hyperparameters Tuning

In [None]:
# Grid Search CV
# 5-fold cross-validated search over n_neighbors = 1..49 on the full X, y.

from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X, y)

print('Best Score:', knn_cv.best_score_, sep='')
print('Best Parameters:', knn_cv.best_params_, sep='')