# Diabetes Dataset

## --> Create Dataframe

In [None]:
import pandas as pd

# Turn the Google Drive "share" link into a direct-download link: the file id
# is the second-to-last path segment of the share URL.
url = 'https://drive.google.com/file/d/1mfqBb3LV4vUQzyMf6nEyTq_1QCAJ2wk5/view?usp=share_link'
file_id = url.split('/')[-2]
url = f'https://drive.google.com/uc?id={file_id}'

# Read the CSV behind the link into a dataframe and display it.
df = pd.read_csv(url)
df

## 1) Preprocess Dataset

In [None]:
# Print the column names, dtypes and non-null counts of the raw dataframe.
df.info()

In [None]:
# Summary statistics per numeric column; a min of 0 here is what reveals
# the placeholder zeros handled further down.
df.describe()

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

Except for pregnancies and insulin level, none of the other parameters (such as skin thickness) can be zero.

### --> Check how many such rows have 0

In [None]:
# Rows in which at least one of the four measurements is recorded as 0.
problem_df = df[(df[['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']] == 0).any(axis=1)]
problem_df

236 of the 736 rows have incorrect values. As they make up approx. 30% of the dataset, removing or replacing them will greatly<br>affect the prediction accuracy. We will try both approaches and see which one yields the higher accuracy.

### 1A) Replace the zeroes in the 4 columns with their respective means

In [None]:
df_rep = df.copy()   # new dataframe in which the zero values will be replaced

columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
for col in columns:
    # A zero in these columns stands for a missing reading, so it must not
    # take part in the mean it is about to be replaced with; otherwise the
    # replacement value is biased towards zero.
    val = df.loc[df[col] != 0, col].mean()
    df_rep[col] = df[col].replace(0, val)

In [None]:
# Summary statistics after the replacement, to check the four columns
# no longer have a minimum of 0.
df_rep.describe()

In [None]:
# (rows, columns) of the replaced dataframe; replacing values drops no rows.
df_rep.shape

### 1B) Remove rows with zeroes entirely

In [None]:
# New dataframe holding only the rows in which none of the four measurements
# is 0. The trailing .copy() makes df_rem an independent dataframe instead of
# a slice of df, so later changes to it can never touch (or warn about) df.
df_rem = df[(df['Glucose']!=0) & (df['BloodPressure']!=0) & (df['SkinThickness']!=0) & (df['BMI']!=0)].copy()
df_rem.describe()

In [None]:
# (rows, columns) left after dropping the rows that contained zeroes.
df_rem.shape

## 2) KNN Prediction

We will have to execute this step twice, once for<br>A) df_rep (replaced) and once for<br>B) df_rem (removed)

### 2A) Df_rep

In [None]:
# 1) Separate the features from the target
X_rep = df_rep.iloc[:, :8].to_numpy()        # Independent: the first 8 columns
y_rep = df_rep.loc[:, 'Outcome'].to_numpy()  # Dependent: the 'Outcome' column

In [None]:
# 2) Split

from sklearn.model_selection import train_test_split as tts

# 80/20 train/test split. random_state pins the shuffle so the scores, plots
# and metrics computed from this split are the same on every run.
X_train_rep, X_test_rep, y_train_rep, y_test_rep = tts(
    X_rep, y_rep, test_size=0.2, random_state=42
)

In [None]:
# 3) Scale

# If we do not normalize the data, the features with higher
# values could be considered more important by the algorithm,
# resulting in an imperfect prediction result.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the mean/std from the training split only, then apply those same
# statistics to the test split. Re-fitting the scaler on the test split would
# scale the two sets differently and leak test-set information into the
# evaluation.
X_train_rep = sc.fit_transform(X_train_rep)
X_test_rep = sc.transform(X_test_rep)

In [None]:
# 4) Train & Predict

# We will find the optimal value of 'k' by testing it for
# values from 1 to 100

from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Score on the test split for every k from 1 to 100;
# test_scores[i - 1] is the score obtained with k = i.
test_scores = []

for i in range(1, 101):
    knn_model_rep = KNeighborsClassifier(n_neighbors=i).fit(X_train_rep, y_train_rep.ravel())
    test_scores.append(knn_model_rep.score(X_test_rep, y_test_rep))

In [None]:
import matplotlib.pyplot as plt

def best_k_value_plot(scores=None):
    """Plot the KNN score obtained for each candidate value of k.

    Parameters
    ----------
    scores : sequence of float, optional
        Scores for k = 1, 2, ..., len(scores), in that order. When omitted,
        the module-level ``test_scores`` list is plotted, which keeps the
        zero-argument call working.
    """
    if scores is None:
        scores = test_scores
    # Derive the x axis from the data so the plot cannot fail when the sweep
    # covers a different number of k values than 100.
    k_values = range(1, len(scores) + 1)

    plt.figure(figsize=(18, 9))
    plt.plot(k_values, scores, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('Best k value is??')
    plt.xlabel('k Value')
    plt.ylabel('Score')

best_k_value_plot()

In [None]:
# Take k from the sweep instead of hard-coding a value read off one plot, so
# the model follows the data when the split or the preprocessing changes.
# test_scores[j] is the score for k = j + 1; on ties the smallest k wins.
# test_scores must still hold the 2A sweep here (run this before the 2B cell).
best_k_rep = test_scores.index(max(test_scores)) + 1

knn_model_rep = KNeighborsClassifier(best_k_rep)
knn_model_rep.fit(X_train_rep, y_train_rep.ravel())
knn_pred_rep = knn_model_rep.predict(X_test_rep)

### 2B) Df_rem

In [None]:
# 1) Separate
X_rem = df_rem.iloc[:, :8].values   # Independent
y_rem = df_rem['Outcome'].values    # Dependent

# 2) Split (random_state pins the shuffle so reruns give the same split)
X_train_rem, X_test_rem, y_train_rem, y_test_rem = tts(
    X_rem, y_rem, test_size=0.2, random_state=42
)

# 3) Scale: fit on the training split only and reuse its mean/std for the
# test split. Re-fitting on the test split would scale the two sets
# differently and leak test-set information into the evaluation.
X_train_rem = sc.fit_transform(X_train_rem)
X_test_rem = sc.transform(X_test_rem)

# 4) Train and Predict: find the best value for k, similar to 2A
test_scores = []

for i in range(1, 101):

    knn_model_rem = KNeighborsClassifier(i)
    knn_model_rem.fit(X_train_rem, y_train_rem.ravel())

    test_scores.append(knn_model_rem.score(X_test_rem, y_test_rem))

best_k_value_plot()

In [None]:
# Take k from the sweep instead of hard-coding a value read off one plot, so
# the model follows the data when the split or the preprocessing changes.
# test_scores[j] is the score for k = j + 1; on ties the smallest k wins.
# test_scores must hold the 2B sweep here (run this right after the 2B cell).
best_k_rem = test_scores.index(max(test_scores)) + 1

knn_model_rem = KNeighborsClassifier(best_k_rem)
knn_model_rem.fit(X_train_rem, y_train_rem.ravel())
knn_pred_rem = knn_model_rem.predict(X_test_rem)

## 3) Accuracy Metrics

### i] Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import tensorflow as tf
import seaborn as sn
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, pltitle):
    """Draw the confusion matrix `cm` as an annotated heatmap titled `pltitle`."""
    plt.figure(figsize=(10, 7))
    # heatmap draws on the figure just created and hands back its axes.
    axes = sn.heatmap(cm, annot=True, fmt='d')
    axes.set_title(pltitle, fontsize=18)
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actuals')

In [None]:
# Rows are the actual classes, columns the predicted ones. scikit-learn's
# confusion_matrix (already imported in the cell above) returns a plain
# integer NumPy array, so TensorFlow is not needed just for this step.
conf_matrix_rep = confusion_matrix(y_test_rep, knn_pred_rep)  # A) Replaced
conf_matrix_rem = confusion_matrix(y_test_rem, knn_pred_rem)  # B) Removed

In [None]:
# Heatmap of the confusion matrix for the model trained on df_rep.
plot_confusion_matrix(conf_matrix_rep, 'Confusion Matrix Replaced')

In [None]:
# Heatmap of the confusion matrix for the model trained on df_rem.
plot_confusion_matrix(conf_matrix_rem, 'Confusion Matrix Removed')

### ii] Accuracy, Precision and Recall

In [None]:
from sklearn.metrics import classification_report as clarep
# Text reports (precision, recall, f1-score, support per class) comparing
# the true test labels with the KNN predictions of each variant.
clarep_rep = clarep(y_test_rep, knn_pred_rep)
clarep_rem = clarep(y_test_rem, knn_pred_rem)
print(f'Replaced:-\n{clarep_rep}')
print(f'Removed:-\n{clarep_rem}')

### iii] Error Rate

In [None]:
from sklearn import metrics

In [None]:
# Error rate in percent = 100 - accuracy in percent, i.e. the share of test
# samples that were misclassified.
knn_rep_err = (100 - metrics.accuracy_score(y_test_rep, knn_pred_rep)*100)
knn_rem_err = (100 - metrics.accuracy_score(y_test_rem, knn_pred_rem)*100)
print(f'KNN error replaced = {round(knn_rep_err, 2)}%') 
print(f'KNN error removed = {round(knn_rem_err, 2)}%')

### iv] RMSE

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error between the true test labels and the predictions.
# NOTE: despite the 'acc' in the names these hold an error, not an accuracy,
# so lower is better.
knn_acc_rem = np.sqrt(mean_squared_error(y_test_rem, knn_pred_rem))
knn_acc_rep = np.sqrt(mean_squared_error(y_test_rep, knn_pred_rep))

In [None]:
# The printed values are root-mean-squared errors (lower is better), so label
# them as RMSE rather than as an accuracy.
print(f'KNN prediction RMSE with removed values = {knn_acc_rem}')
print(f'KNN prediction RMSE with replaced values = {knn_acc_rep}')