In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
#for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for preprocessing and EDA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#for ml models:
from sklearn.neighbors import KNeighborsClassifier

# for ML metrics:
from sklearn.metrics import confusion_matrix

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)
data.head()

In [None]:
# Count missing (NaN) entries per column in the raw frame.
data.isna().sum()

In [None]:
# Print a structural summary of the raw frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Summary statistics, transposed so each feature is a row.
data.describe().transpose()

In [None]:
# Bar chart of how many rows fall into each Outcome class.
sns.countplot(data=data, x='Outcome')

In [None]:
# Work on a deep copy so the raw frame `data` stays untouched.
data_copy = data.copy(deep=True)

# In these columns a value of 0 is treated as "missing" and converted to NaN so
# it can be imputed later.
# `np.nan` is used instead of the alias `np.NaN`, which was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data_copy[zero_as_missing_cols] = data_copy[zero_as_missing_cols].replace(0, np.nan)

# Show how many values per column are now flagged as missing.
data_copy.isnull().sum()

In [None]:
# Histogram of every column of the raw (pre-imputation) frame.
p = data.hist(figsize=(20, 20))

In [None]:
# Impute the NaNs: mean for Glucose and BloodPressure, median for
# SkinThickness, Insulin and BMI.
#
# The fill values are computed first and applied with a single frame-level
# `fillna`. The previous `data_copy[col].fillna(..., inplace=True)` form is a
# chained in-place operation on a column selection: it emits a FutureWarning in
# pandas 2.2 and does not update `data_copy` once Copy-on-Write is enabled.
fill_values = {
    'Glucose': data_copy['Glucose'].mean(),
    'BloodPressure': data_copy['BloodPressure'].mean(),
    'SkinThickness': data_copy['SkinThickness'].median(),
    'Insulin': data_copy['Insulin'].median(),
    'BMI': data_copy['BMI'].median(),
}
data_copy = data_copy.fillna(value=fill_values)

In [None]:
# Re-check the per-column missing-value counts on the working copy.
data_copy.isna().sum()

In [None]:
# Visual check of per-column completeness of the working copy using missingno's bar chart.
import missingno as msno
p=msno.bar(data_copy)

In [None]:
# Pairwise feature plots of the working copy, coloured by Outcome class.
p = sns.pairplot(data=data_copy, hue='Outcome')


In [None]:
# Annotated correlation heatmap of the working copy on a large canvas.
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=data_copy.corr(),
    annot=True,
    cmap='RdYlGn',
)

In [None]:
# Two correlation heatmaps for comparison: first the raw frame, then the
# working copy.
plt.figure(figsize=(6, 4))
sns.heatmap(data=data.corr(), annot=True, cmap='RdYlGn')

plt.figure(figsize=(6, 4))
sns.heatmap(data=data_copy.corr(), annot=True, cmap='RdYlGn')

In [None]:
# Standardise every feature column (all columns except the target "Outcome")
# and wrap the result back into a DataFrame.
#
# Column names and the row index are taken from the feature frame itself rather
# than from a hand-maintained list, so X cannot drift out of sync with
# data_copy and stays row-aligned with it.
#
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. Consider fitting on the
# training split only.
features = data_copy.drop(columns=["Outcome"])
ss = StandardScaler()
X = pd.DataFrame(
    ss.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

In [None]:
# Preview the first rows of the standardised feature frame.
X.head()

In [None]:
# Target vector: the Outcome column of the working copy.
y = data_copy['Outcome']
y

In [None]:
# 70/30 stratified train/test split with a fixed seed.
#
# The features passed in are the standardised matrix X. Previously the split
# was done on data_copy, which still contains the "Outcome" column (the label
# leaks into the features) and is unscaled, so X was never actually used.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [None]:
# Sweep n_neighbors over 1..14 and record train and test accuracy for each value.
train_scores, test_scores = [], []
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_x, train_y)

    test_scores.append(knn.score(test_x, test_y))
    train_scores.append(knn.score(train_x, train_y))

In [None]:
# Plot train and test accuracy against the number of neighbours.
#
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old `sns.lineplot(range(...), scores, ...)` call raises
# a TypeError there.
plt.figure(figsize=(12, 5))
k_values = list(range(1, 15))  # must match the range used when the scores were collected
p = sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Score')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Build and fit a KNN classifier with 31 neighbours, then report its accuracy
# on the held-out split.
knn = KNeighborsClassifier(n_neighbors=31).fit(train_x, train_y)
knn.score(test_x, test_y)

In [None]:
# Predict on the held-out split and show the confusion matrix in two forms.
y_pred = knn.predict(test_x)

# Print the raw matrix explicitly. As a bare expression in the middle of the
# cell its value was computed and then silently discarded.
print(confusion_matrix(test_y, y_pred))

# Labelled version with row/column totals; as the last expression it is
# rendered as the cell output.
pd.crosstab(test_y, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Per-class precision, recall and F1 for the held-out predictions.
from sklearn.metrics import classification_report

print(classification_report(y_true=test_y, y_pred=y_pred))

In [None]:
# Compute ROC curve points from the predicted probability of class 1.
from sklearn.metrics import roc_curve

y_pred_proba = knn.predict_proba(test_x)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=test_y, y_score=y_pred_proba)

In [None]:
# Draw the ROC curve of the fitted classifier against the chance diagonal.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
# The title reads n_neighbors from the estimator itself. The old hard-coded
# "n_neighbors=10" did not match the model that produced the curve.
plt.title(f'Knn(n_neighbors={knn.n_neighbors}) ROC curve')
# The curve is given a label, so render the legend to actually show it.
plt.legend()
plt.show()

In [None]:
#Area under ROC curve
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y,y_pred_proba)

In [None]:
from sklearn.model_selection import GridSearchCV

# For KNN the hyper-parameter to tune is n_neighbors; try every value in 1..49
# with 5-fold cross-validation.
param_grid = {'n_neighbors': np.arange(1, 50)}

# The base estimator is created inline so the already-fitted `knn` from the
# earlier cells is not overwritten with an unfitted one.
knn_cv = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

# Fit on the standardised feature matrix X. The previous call used data_copy,
# which still contains the "Outcome" column (label leakage) and is unscaled.
knn_cv.fit(X, y)

print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))

In [None]:
# Write the held-out predictions to submission.csv as a single "predicted" column.
submission = pd.DataFrame(data={"predicted": y_pred})
submission.to_csv(path_or_buf='submission.csv', index=False)