# Diabetes Prediction

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Load the Pima Indians diabetes CSV from the Kaggle input directory
data = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the first rows of the frame
data.head()

In [None]:
# Number of null entries in each column
data.isnull().sum()

In [None]:
# Column names, dtypes and non-null counts
data.info()

In [None]:
# Summary statistics for the numeric columns
data.describe()

#### Finding outliers

In [None]:
def BoxPlot(x, y, axis):
    """
    Draw a seaborn box plot of column ``y`` grouped by column ``x``.

    Both column names are looked up in the module-level ``data`` frame.
    When ``x`` is 'Outcome' the two groups are:
    0 : Not Diabetes
    1 : Diabetes

    x    : name of the grouping column (horizontal axis)
    y    : name of the column whose distribution is drawn (vertical axis)
    axis : matplotlib Axes to draw on

    Returns whatever ``sns.boxplot`` returns.
    """
    plot_kwargs = dict(data=data, x=x, y=y, ax=axis)
    return sns.boxplot(**plot_kwargs)

In [None]:
# One box plot per feature, split by Outcome, on a 4x2 grid of panels.
fig, ax = plt.subplots(4, 2, figsize=(15, 15))
box_columns = [
    'Glucose', 'BloodPressure',
    'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction',
    'Age', 'Pregnancies',
]
# ax.flat walks the grid row by row: ax[0, 0], ax[0, 1], ax[1, 0], ...
for column, panel in zip(box_columns, ax.flat):
    BoxPlot('Outcome', column, panel)
plt.show()

In [None]:
def Outliers(col):
    """
    Count the values in ``col`` that lie more than three standard
    deviations away from the mean of ``col``.

    col : iterable of numbers accepted by ``np.mean`` / ``np.std``

    Returns the number of such values as an int.
    """
    center = np.mean(col)
    spread = np.std(col)
    # Three-sigma band around the mean; anything outside it is an outlier
    margin = spread * 3
    low = center - margin
    high = center + margin
    return sum(1 for value in col if value < low or value > high)

In [None]:
# Report the 3-sigma outlier count for every feature drawn in the box plots.
# Insulin is plotted there but was missing from this report, so it is added
# here to make the outlier summary cover all eight features.
report_columns = [
    'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age', 'Pregnancies',
]
for column in report_columns:
    print('Outliers in ' + column + ' :', Outliers(data[column]))

There are very few outliers, so we can ignore them.

# EDA

In [None]:
def ScatterPlot(x, y, axis):
    """
    Draw a seaborn scatter plot of column ``y`` against column ``x``,
    colouring each point by its 'Outcome' value:
    0 : Not Diabetes
    1 : Diabetes

    Column names are looked up in the module-level ``data`` frame.

    x    : name of the column on the horizontal axis
    y    : name of the column on the vertical axis
    axis : matplotlib Axes to draw on

    Returns whatever ``sns.scatterplot`` returns.
    """
    plot_kwargs = dict(data=data, x=x, y=y, hue='Outcome', ax=axis)
    return sns.scatterplot(**plot_kwargs)

In [None]:
# Scatter each measurement (x) against Age (y) on a 3x2 grid of panels.
fig, ax = plt.subplots(3, 2, figsize=(15, 15))
scatter_columns = [
    'Glucose', 'BloodPressure',
    'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction',
]
# ax.flat walks the grid row by row: ax[0, 0], ax[0, 1], ax[1, 0], ...
for column, panel in zip(scatter_columns, ax.flat):
    ScatterPlot(column, 'Age', panel)
plt.show()

#### Observation of above plots : 

1. In the first plot (Glucose vs. Age), more women have diabetes as the glucose level increases.
2. In the next four plots, the main observation is that the chance of having diabetes increases with age.
3. In the last plot, women who are older and have a diabetes pedigree function above 0.5 have a higher chance of diabetes.

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of data
plt.figure(figsize=(10, 8))
correlations = data.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', square=True)

In [None]:
def OutcomeAnalysis(col_name, axis):
    """
    Bar-plot the mean of ``col_name`` for each 'Outcome' group.

    The means are computed from the module-level ``data`` frame and sorted
    in ascending order before plotting, so the bar order follows the values,
    not the Outcome label.

    col_name : name of the column to average
    axis     : matplotlib Axes to draw on

    Returns whatever the pandas ``.plot`` call returns.
    """
    group_means = data.groupby('Outcome')[col_name].mean()
    ordered_means = group_means.sort_values()
    return ordered_means.plot(kind='bar', color='coral', ax=axis)

In [None]:
# Mean of each measurement per Outcome group, one titled panel per feature.
fig, ax = plt.subplots(3, 2, figsize=(15, 15))
panel_specs = [
    ('Glucose', 'Avg.Glucose level'),
    ('BloodPressure', 'Avg.BloodPressure'),
    ('SkinThickness', 'Avg.SkinThickness'),
    ('Insulin', 'Avg.Insulin'),
    ('BMI', 'Avg.BMI'),
    ('DiabetesPedigreeFunction', 'Avg.DiabetesPedigreeFunction'),
]
# ax.flat walks the grid row by row: ax[0, 0], ax[0, 1], ax[1, 0], ...
for (column, title), panel in zip(panel_specs, ax.flat):
    OutcomeAnalysis(column, panel)
    panel.set_title(title)
plt.show()

#### Observation of above plots :
* Average Insulin value is higher for women who have diabetes.
* Average Diabetes Pedigree Function is higher for women who have diabetes.

In [None]:
# Distribution of every feature on a 4x2 grid of panels.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
fig, ax = plt.subplots(4, 2, figsize=(15, 15))
dist_columns = [
    'Pregnancies', 'Glucose',
    'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age',
]
# ax.flat walks the grid row by row: ax[0, 0], ax[0, 1], ax[1, 0], ...
for column, panel in zip(dist_columns, ax.flat):
    sns.histplot(data[column], kde=True, stat='density', ax=panel)
plt.show()

#### Observation of above plots :

* Glucose, BloodPressure and BMI are approximately normally distributed.
* Pregnancies, Insulin, DiabetesPedigreeFunction and Age are right-skewed.


###### The data needs to be normalized.

#### Data Pre-processing

In [None]:
# Target vector: the Outcome column
y = data['Outcome']
# Feature matrix: every column of data except Outcome
X = data.drop(columns=['Outcome'])

In [None]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so the scores are reproducible across runs;
# stratify=y keeps the Outcome class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Scaler that standardizes each feature to zero mean and unit variance
sc = StandardScaler()

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, and scale the training features with them.
X2_train = sc.fit_transform(X_train)
# Apply the *training* statistics to the test split. Calling fit_transform
# here would re-fit the scaler on the test data, so the two splits would be
# scaled differently and test-set statistics would leak into the evaluation.
X2_test = sc.transform(X_test)
# Labels are not scaled; the aliases keep the naming parallel to X2_*.
y2_train = y_train
y2_test = y_test

#### Model Implementation

In [None]:
def OptimalKNN(X_train, X_test, y_train, y_test, max_k=50):
    """
    Score distance-weighted KNN classifiers for k = 1 .. max_k - 1.

    For each k a ``KNeighborsClassifier(n_neighbors=k, weights='distance')``
    is fitted on the training split and evaluated on the test split.

    X_train, y_train : training features and labels
    X_test, y_test   : test features and labels
    max_k            : exclusive upper bound for k (default 50, i.e. k = 1..49)

    Returns a tuple ``(f1_results, error_results)`` of DataFrames with columns
    ['K', 'F1 Score'] and ['K', 'Error Rate'], where the error rate is
    1 - accuracy. Both scores are rounded to 4 decimals.
    """
    f1_scores = []
    error_rates = []  # error rate = 1 - accuracy

    for k in range(1, max_k):
        knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        # Scorers take (y_true, y_pred) in that order
        f1_scores.append((k, round(f1_score(y_test, y_pred), 4)))
        # Round after subtracting so the stored value carries no float
        # artefacts beyond the fourth decimal
        error_rates.append((k, round(1 - accuracy_score(y_test, y_pred), 4)))

    f1_results = pd.DataFrame(f1_scores, columns=['K', 'F1 Score'])
    error_results = pd.DataFrame(error_rates, columns=['K', 'Error Rate'])

    return f1_results, error_results

In [None]:
# KNN scores per K on the raw (unscaled) features
f1, error = OptimalKNN(X_train, X_test, y_train, y_test)
# KNN scores per K on the standardized features
f1_scale, error_scale = OptimalKNN(X2_train, X2_test, y2_train, y2_test)

In [None]:
def PlotKNN(accuracies, y_axis):
    """
    Plot a per-K score table as a line chart and show it.

    accuracies : DataFrame with a 'K' column plus the score column(s) to plot
    y_axis     : label for the vertical axis; also appended to the title
                 after 'KNN '

    Returns None; the figure is displayed with ``plt.show()``.
    """
    by_k = accuracies.set_index('K')
    ax = by_k.plot(figsize=(8, 8))
    ax.set_xlabel('K')
    ax.set_ylabel(y_axis)
    chart_title = 'KNN ' + y_axis
    plt.title(chart_title)
    plt.show()

In [None]:
Error_Rate = 'Error Rate'
f1_score = 'F1 Score'
PlotKNN(f1, f1_score)

In [None]:
PlotKNN(error, Error_Rate)

In [None]:
PlotKNN(f1_scale, f1_score)

In [None]:
PlotKNN(error_scale, Error_Rate)