In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**IMPORTING DATASET**

In [None]:
# Location of the diabetes CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"

# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

**DATA PREPROCESSING**

In [None]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics for the DataFrame's columns.
df.describe()

The columns Glucose, BloodPressure, SkinThickness, Insulin and BMI contain 0 values, which most likely stand in for missing measurements. We'll clean these features (independent variables) by replacing the 0's in each column with that column's median.

In [None]:
# Columns in which a value of 0 is treated as a missing measurement.
cols_with_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in cols_with_zero:
    # Mark zeros as missing so they do not take part in the median.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df[col] = df[col].replace(0, np.nan)
    # Keep the exact median: truncating it with int() would shift the
    # imputed value for any column whose median is not a whole number.
    median = df[col].median(skipna=True)
    df[col] = df[col].replace(np.nan, median)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Number of missing (null) values in each column.
df.isnull().sum()

**DATA VISUALISATION**

Finding the correlations between the different features present in our dataset.

In [None]:
# Pairwise correlation matrix of the DataFrame's columns.
corr=df.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on a 10x10-inch figure.
heatmap_size = (10, 10)
plt.figure(figsize=heatmap_size)
sns.heatmap(data=corr, annot=True)

In [None]:
# Wide figure so that every distinct SkinThickness value gets a readable bar.
plt.figure(figsize=(30,10))
# Name the column via x= and the frame via data=: recent seaborn releases
# treat the first positional argument as `data`, so passing the Series
# positionally no longer counts occurrences of each value.
sns.countplot(x="SkinThickness", data=df)

In [None]:
# Names of the feature columns to plot, one entry per column.
df_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

We use seaborn's `FacetGrid` to break each variable up across multiple subplots (one per `Outcome` value) and combine those subplots into a single figure.

In [None]:
# For every feature, draw one KDE panel per Outcome value in a shared grid.
for feature in df_columns:
    grid = sns.FacetGrid(df, col='Outcome')
    grid.map(sns.kdeplot, feature)

In [None]:
# Apply the "whitegrid" theme by calling set_style. Assigning to the name
# (sns.set_style = "whitegrid") would replace the function with a string
# and leave the plot style unchanged.
sns.set_style("whitegrid")
# Pairwise scatter/distribution plots of all columns, coloured by Outcome.
sns.pairplot(df,hue="Outcome",palette="coolwarm")

In [None]:
# Scatter plot of Insulin against BMI on a 10x10-inch figure, coloured by Outcome.
scatter_size = (10, 10)
plt.figure(figsize=scatter_size)
sns.scatterplot(data=df, x="Insulin", y="BMI", hue="Outcome")

In [None]:
# Draw the three distributions side by side in ONE figure.
# The subplot codes 111 / 121 / 131 each select the first cell of a
# different grid (1x1, 1x2, 1x3), and calling plt.show() between them
# produces separate, partly empty figures. Positions 1..3 of a single
# 1x3 grid give the intended layout.
plt.figure(figsize=(18, 5))
for position, column in enumerate(['Age', 'Glucose', 'BloodPressure'], start=1):
    plt.subplot(1, 3, position)
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # consider migrating to histplot once the seaborn version is confirmed.
    sns.distplot(df[column], bins=10, kde=True)
plt.show()

In [None]:
# Strip plot of Age for each Outcome value.
sns.stripplot(x='Outcome', y="Age" , data=df)

In [None]:
# Strip plot of Glucose for each Outcome value.
sns.stripplot(x='Outcome', y="Glucose" , data=df)

Splitting the dataset into the independent variables (features, `X`) and the dependent variable (target, `y`)

In [None]:
# Every column except the last one is a feature; the last column is the target.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]

# Convert both to NumPy arrays for scikit-learn.
X = feature_frame.values
y = target_series.values

**SPLITTING DATASET INTO TRAINING SET AND TEST SET**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

**FEATURE SCALING**

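Standardize the features so that they all lie in a similar range. K-NN is distance-based, so features with large numeric ranges would otherwise dominate the distance computation.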

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training data
# only, then standardize the training set with them.
X_train = sc.fit_transform(X_train)
# Reuse the training statistics for the test set. Calling fit_transform
# here would re-fit the scaler on the test data, putting train and test on
# different scales and leaking test-set information into preprocessing.
X_test = sc.transform(X_test)

**TRAINING THE K-NN MODEL ON THE TRAINING SET**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-NN configuration: 11 neighbours, Euclidean distance metric.
knn_params = dict(n_neighbors=11, metric='euclidean', p=2)
classifier = KNeighborsClassifier(**knn_params)

# Fit on the training data; the fitted estimator is shown as the cell output.
classifier.fit(X_train, y_train)

**PREDICTING THE TEST SET RESULT**

In [None]:
# Predict a label for every test sample.
y_pred = classifier.predict(X_test)

# Print predictions (first column) next to the true labels (second column).
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

**MAKING THE CONFUSION MATRIX**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test samples classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the test-set predictions.
print(f1_score(y_test,y_pred))

**WE GOT AN ACCURACY OF 81% AND AN F1 SCORE OF 68%, WHICH IS NOT VERY GOOD BUT IS SATISFACTORY**