In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

In [None]:
# Read the Pima Indians diabetes CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the first five rows to check the column names and value formats
df.head(n=5)

In [None]:
# Summary statistics per column: count, mean, std, min, quartiles and max
# (the percentiles listed are pandas' defaults, spelled out for the reader)
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Print each column's dtype and non-null count to check for missing values
df.info()

**No Null Values. We can proceed with our data analysis.**

In [None]:
# Pairwise scatter/density plots of every column, coloured by Outcome.
# sns.color_palette(...) only *returns* a list of colours, so calling it on its own
# line has no effect on the plot. Build the "hls" palette with one colour per Outcome
# level and hand it to pairplot explicitly; a list palette is always treated as
# categorical, so each class gets its own distinct colour.
outcome_palette = sns.color_palette("hls", df['Outcome'].nunique())
sns.pairplot(df, hue='Outcome', palette=outcome_palette)

In [None]:
# The distribution curve of Glucose with respect to Outcome shows that fewer people have a high Glucose level, but those who do have a higher chance of diabetes.

In [None]:
## The plots tell us the following:
# Across the Pregnancy range, females with high Glucose have diabetes.
# As Insulin and Glucose increase, the chance of diabetes is higher.
# As BMI and Glucose increase, the chance of diabetes is higher.
# Age alone isn't really an indicator of diabetes.

In [None]:
# Pairwise correlation between all columns (features and Outcome), drawn as a viridis heatmap
sns.heatmap(data=df.corr(), cmap='viridis')

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
scaler.fit(df.drop('Outcome', axis=1))

In [None]:
scaler_features = scaler.transform(df.drop('Outcome', axis=1))

In [None]:
df_feat = pd.DataFrame(scaler_features,columns=df.columns[:-1])
df_feat.head()

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
X_train, X_test, y_train, y_test = train_test_split(scaler_features,df['Outcome'],
                                                    test_size=0.30)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline classifier built with the library's default settings (no n_neighbors given)
knn = KNeighborsClassifier()

In [None]:
# Train the baseline model and predict labels for the held-out rows in one chained call
# (fit returns the estimator itself, so knn is the fitted model afterwards)
pred = knn.fit(X_train, y_train).predict(X_test)

In [None]:
# Now let's evaluate our model's performance.
# Two widely used tools for this are the confusion matrix and the classification report.
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Print the confusion matrix, a blank spacer, then the per-class classification report
for section in (confusion_matrix(y_test, pred), '\n', classification_report(y_test, pred)):
    print(section)

In [None]:
# Elbow method: estimate the error rate for each candidate k and look for where it levels off.
# The error is measured with 5-fold cross-validation on the training rows only, so the
# held-out test set is not used to pick k and stays an unbiased check of the final model.
from sklearn.model_selection import cross_val_score

error_rate = []

# Will take some time
for k in range(1, 50):
    candidate = KNeighborsClassifier(n_neighbors=k)
    fold_accuracy = cross_val_score(candidate, X_train, y_train, cv=5)
    # cross_val_score reports accuracy for a classifier, so error = 1 - mean accuracy
    error_rate.append(1 - fold_accuracy.mean())

In [None]:
# Elbow plot: one marker per candidate K (1-49) against its error rate
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 50), error_rate, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

In [None]:
# Reference point for comparison: a single-neighbour model (K=1).
# Note: the first model above was created with KNeighborsClassifier() and no explicit
# n_neighbors, so K=1 here is a fresh baseline rather than a repeat of that model.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
pred = knn.predict(X_test)

# Header, spacer, confusion matrix, spacer, classification report -- one print per item
for section in ('WITH K=1', '\n', confusion_matrix(y_test, pred), '\n',
                classification_report(y_test, pred)):
    print(section)


In [None]:
# Refit with K=17 and print the same evaluation so it can be compared with K=1
knn = KNeighborsClassifier(n_neighbors=17).fit(X_train, y_train)
pred = knn.predict(X_test)

# Header, spacer, confusion matrix, spacer, classification report -- one print per item
for section in ('WITH K=17', '\n', confusion_matrix(y_test, pred), '\n',
                classification_report(y_test, pred)):
    print(section)

In [None]:
# All of our metrics show an improved model.