In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **IMPORT RELEVANT LIBRARIES**

In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

# **LOADING THE DATASET**

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

# **ANALYZING THE DATASET**

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Print a concise overview of the frame: column names, non-null counts and dtypes.
dataset.info()

In [None]:
# Column labels of the frame (for a DataFrame, keys() returns its columns).
dataset.keys()

# **CHECK FOR MISSING VALUES**

In [None]:
# Element-wise boolean mask with the same shape as `dataset`: True where a value is null (NaN/None).
dataset.isnull()

In [None]:
# Count of null (NaN/None) entries in each column; all zeros means no nulls were found.
dataset.isnull().sum()

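**`isnull()` reports no null (NaN) values in any column.** Note that this check only detects NaN/None entries; it would not flag placeholder values (for example, zeros standing in for unrecorded measurements).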

# **SPLITTING INPUT AND TARGET VARIABLE**

In [None]:
# Features are every column other than 'Outcome'; the target is the 'Outcome'
# column itself. Both are taken as plain NumPy arrays via .values.
target_column = 'Outcome'
X = dataset.drop(columns=[target_column]).values   # Input
y = dataset[target_column].values                  # Target

# **TRAIN/TEST SPLIT**

In [None]:
# Hold out part of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,      # 30% of the rows go to the test split
    random_state=42,    # fixed seed so the split is repeatable
    stratify=y,         # keep the class proportions of y in both splits
)

# **IMPLEMENTING THE KNN CLASSIFIER WITH 7 NEIGHBORS**

In [None]:
# Create a k-NN classifier with 7 neighbors
knn = KNeighborsClassifier(n_neighbors=7)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Score once on the held-out test split and reuse the value, instead of
# running prediction over X_test twice.
test_score = knn.score(X_test, y_test)
print(test_score)

# Let the format spec do the percent conversion and rounding. The previous
# `round(score, 2) * 100` multiplied an already-rounded float, which can
# surface binary floating-point noise (e.g. 56.99999999999999 instead of 57).
print("Accuracy = {:.0%}".format(test_score))

# **IMPLEMENTING THE KNN CLASSIFIER WITH K NEIGHBORS**

In [None]:
# Candidate neighbor counts to sweep over: k = 1 .. 8.
neighbors = np.arange(1, 9)

# One uninitialised float slot per candidate k, for the train and test accuracies.
train_accuracy, test_accuracy = (np.empty(neighbors.size) for _ in range(2))

In [None]:
# Sweep over the candidate neighbor counts, recording train and test accuracy
# for each one in the matching slot of the accuracy arrays.
#
# NOTE(review): `knn` is rebound on every iteration, so once this loop ends it
# refers to the model fitted with the LAST value in `neighbors`, replacing
# whatever classifier `knn` pointed to before this cell ran.
for idx, n_neighbors in enumerate(neighbors):
    # Build the classifier for this k and fit it on the training split
    # (fit returns the estimator itself).
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

    # Accuracy on the data the model was fitted on ...
    train_accuracy[idx] = knn.score(X_train, y_train)

    # ... and on the held-out test split.
    test_accuracy[idx] = knn.score(X_test, y_test)

In [None]:
# Plot test and training accuracy against the number of neighbors on one set of axes.
fig, ax = plt.subplots()
ax.plot(neighbors, test_accuracy, label='Testing Accuracy')
ax.plot(neighbors, train_accuracy, label='Training Accuracy')
ax.set_title('KNN varying number of neighbors')
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

# **ANALYZING THE RESULTS**

In [None]:
# NOTE(review): `knn` is whichever classifier an earlier cell bound last. Run
# top to bottom, that is the final model from the neighbor sweep
# (n_neighbors=8, the last value of np.arange(1, 9)), not the 7-neighbor model
# fitted before the sweep. Confirm this is the model meant to be analysed.
y_pred = knn.predict(X_test)

# Print the confusion matrix first, then the per-class classification report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

## **PLOTTING AN ROC CURVE**

In [None]:
# Probability assigned to the positive class (second column of predict_proba)
# for every test row.
y_pred_prob = knn.predict_proba(X_test)[:, 1]

# False positive rate, true positive rate and the thresholds they were computed at.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Draw the ROC curve together with the dashed diagonal reference line.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

## **AUC COMPUTATION**

In [None]:
# AUC on the held-out test split, from the positive-class probabilities.
test_auc = roc_auc_score(y_test, y_pred_prob)
print("AUC: {}".format(test_auc))

# AUC under 5-fold cross-validation over the full X, y: one score per fold.
cv_auc = cross_val_score(knn, X, y, scoring='roc_auc', cv=5)

# Show the per-fold scores.
cv_message = "AUC scores computed using 5-fold cross-validation: {}"
print(cv_message.format(cv_auc))

# **HYPERPARAMETER TUNING (GridSearchCV and RandomizedSearchCV)**

In [None]:
## GridSearchCV
# Candidate values to try: n_neighbors = 1 .. 49.
param_grid = dict(n_neighbors=np.arange(1, 50))

# Fresh, unfitted classifier for the search to evaluate with each candidate.
knn = KNeighborsClassifier()

# Exhaustive search, scoring every candidate with 5-fold cross-validation
# over the full X, y.
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X, y)

# Report the winning parameter setting and its mean cross-validated score.
best_params, best_score = knn_cv.best_params_, knn_cv.best_score_
print(best_params)
print("Best score is {}".format(best_score))