In [None]:
# Required libraries: array maths, tabular data handling and plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # for data visualization
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Read the diabetes CSV into a DataFrame.
# NOTE(review): this relative path looks like a Kaggle input directory;
# adjust it when running the notebook elsewhere.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column summary: dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Number of missing (NaN) entries in each column.
# NOTE(review): this only detects NaN; if missing measurements are encoded
# as 0 in this dataset they will not show up here - confirm.
df.isna().sum()

In [None]:
# Pairwise correlation matrix of all columns (reused by the heatmap below).
correlation = df.corr()

# Correlation of every column with the target, strongest positive first.
outcome_corr = correlation['Outcome']
outcome_corr.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the full correlation matrix.
# Uses an explicit figure/axes pair instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 8))
# `linewidths` must be non-zero for `linecolor` to be visible (seaborn's
# default linewidths is 0, so the white cell borders were never drawn).
a = sns.heatmap(correlation, square=True, annot=True, fmt='.2f',
                linewidths=0.5, linecolor='white', ax=ax)
a.set_title('Correlation of Attributes with Outcome Attribute')
# The former set_xticklabels/set_yticklabels calls only re-applied the labels
# the axes already had, so they were dropped.
plt.show()

In [None]:
# Target vector: the 'Outcome' column.
y = df['Outcome']
# Feature matrix: every remaining column (df itself is left untouched).
X = df.drop(columns=['Outcome'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# (rows, columns) of the training and test feature matrices.
X_train.shape,X_test.shape

In [None]:
# Number of non-null target values in the training and test splits.
y_train.count(),y_test.count()

In [None]:
from  sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts k = 1..29, plus one uninitialised float slot
# per candidate for the train and test accuracy (filled by the sweep below).
neighbors = np.arange(1, 30)
train_accuracy = np.empty_like(neighbors, dtype=float)
test_accuracy = np.empty_like(neighbors, dtype=float)

# Fit one classifier per candidate k and record its accuracy on both splits.
# NOTE(review): the features are passed to KNN unscaled; since KNN is
# distance-based, consider standardising them (e.g. in a Pipeline).
# NOTE(review): the test split is scored for every k; picking k from these
# numbers makes later test-set metrics optimistic.
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)


In [None]:

# Train vs. test accuracy as a function of the neighbour count k.
# Drawn through an explicit axes object; the title typo ('diffrent') is fixed.
fig, ax = plt.subplots()
ax.plot(neighbors, test_accuracy, label='Testing Accuracy')
ax.plot(neighbors, train_accuracy, label='Training accuracy')
ax.set_title('KNN scores with different K values')
ax.set_xlabel('Number of neighbors')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Final (still unfitted) classifier with a fixed neighbour count.
# NOTE(review): 23 is hard-coded - presumably read off the accuracy-vs-k plot; confirm.
# If it was chosen from test-set accuracy, the test metrics below are optimistic.
knn= KNeighborsClassifier(n_neighbors=23)


In [None]:
# Train on the training split, then predict labels for the held-out rows.
# `fit` returns the estimator itself, so the two calls can be chained.
y_pred = knn.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn import metrics
# ROC curve of the fitted classifier on the held-out test split.
# `metrics.plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `RocCurveDisplay.from_estimator` is its replacement (needs >= 1.0).
metrics.RocCurveDisplay.from_estimator(knn, X_test, y_test)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Fraction of test rows predicted correctly, rounded to two decimals.
acc = round(accuracy_score(y_test, y_pred), 2)
print('accuracy_score:{0}'.format(acc))

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes (scikit-learn's convention).
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support for the test predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the grid search: neighbour counts k = 1..49.
param_grid = dict(n_neighbors=np.arange(1, 50))

In [None]:
# 5-fold cross-validated search over the neighbour counts in `param_grid`.
# The base estimator is passed inline instead of being bound to `knn`:
# rebinding `knn` to an unfitted estimator would clobber the fitted model used
# by the evaluation cells above and break them when re-run.
knn_cv = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
# NOTE(review): the search is fitted on the full X, y, so the rows held out
# as the test split also take part in model selection - confirm this is
# intended, otherwise fit on X_train, y_train.
knn_cv.fit(X, y)

In [None]:
# Mean cross-validated score of the best parameter setting found by the search.
knn_cv.best_score_

In [None]:
# Parameter setting (here: n_neighbors) that achieved the best cross-validated score.
knn_cv.best_params_