## Diabetes Prediction.
* This is a dataset from Kaggle containing 768 records of individuals, each labelled as having diabetes or not  
* The aim of this project is to make predictions using the patterns a Machine Learning model has learned from the training features.  
* This is a typical <b>Supervised Learning - Classification</b> Problem  
* 80% of the data will be used for training and the remaining 20% will be used for testing  
* The proposed algorithm is <b>K-Nearest Neighbour</b>
* We will be doing the following  
    * EDA  
    * Data Splitting  
    * Data Preprocessing  (With both MinMaxScaler and StandardScaler)  
    * Data Training  
    * Data Prediction
    * Checking the confusion matrix  
    * Checking the accuracy of the model

In [48]:
import numpy as np
import pandas as pd
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


In [11]:
# Read the diabetes CSV from the working directory and preview its first five rows.
the_data = pd.read_csv('diabetes.csv')
the_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
# Count the missing (NaN) entries in every column of the raw frame.
the_data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [13]:
# Count the rows that are exact duplicates of an earlier row.
the_data.duplicated().sum()

0

In [14]:
# Work on an independent copy so the cleaning steps below leave `the_data` untouched.
ml_data = the_data.copy(deep=True)

In [15]:
# Per-column count of missing (NaN) entries in the working copy.
ml_data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [16]:
# Columns in which a recorded 0 is treated as a missing measurement rather than a real value.
not_zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for columns in not_zero_cols:
    # Mark the zeros as missing. `np.nan` is used because the `np.NaN` alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    ml_data[columns] = ml_data[columns].replace(0, np.nan)
    # Mean of the remaining (non-missing) readings. int() truncates it to a whole
    # number, so float columns such as BMI are also filled with an integer value.
    # NOTE(review): the mean is computed over the full dataset before the
    # train/test split, so test rows influence the fill value — confirm this is acceptable.
    average = int(ml_data[columns].mean(skipna=True))
    # Fill every missing entry with that truncated mean.
    ml_data[columns] = ml_data[columns].fillna(average)

In [18]:
# Preview the original frame for comparison: its zero readings are still in place.
the_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Preview the cleaned copy: compare the Insulin and SkinThickness values with the
# original frame, where the zeros have not been replaced.
ml_data.head() #Check out the insulin records and compare it to that of the original data above

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [19]:
# Split the columns by position: the first eight are the model inputs,
# the ninth (Outcome) is the target to predict.
input_features = ml_data.iloc[:, :8]
results = ml_data.iloc[:, 8]

In [20]:
# Preview the eight input columns selected for the model.
input_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [23]:
# Preview the target column; to_frame() turns the Series into a one-column table for display.
results.head().to_frame()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


## SPLITTING THE DATASET INTO TRAINING AND TEST DATA.

In [172]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    input_features,
    results,
    test_size=0.20,
    random_state=1,
)

In [173]:
# Display the held-out feature rows (pandas truncates the rendered table).
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
285,7,136.0,74.0,26.0,135.0,26.0,0.647,51
101,1,151.0,60.0,29.0,155.0,26.1,0.179,22
581,6,109.0,60.0,27.0,155.0,25.0,0.206,27
352,3,61.0,82.0,28.0,155.0,34.4,0.243,46
726,1,116.0,78.0,29.0,180.0,36.1,0.496,25
...,...,...,...,...,...,...,...,...
563,6,99.0,60.0,19.0,54.0,26.9,0.497,32
318,3,115.0,66.0,39.0,140.0,38.1,0.150,28
154,8,188.0,78.0,29.0,155.0,47.9,0.137,43
684,5,136.0,82.0,29.0,155.0,32.0,0.640,69


In [174]:
# Display the training feature rows (pandas truncates the rendered table).
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
663,9,145.0,80.0,46.0,130.0,37.9,0.637,40
712,10,129.0,62.0,36.0,155.0,41.2,0.441,38
161,7,102.0,74.0,40.0,105.0,37.2,0.204,45
509,8,120.0,78.0,29.0,155.0,25.0,0.409,64
305,2,120.0,76.0,37.0,105.0,39.7,0.215,29
...,...,...,...,...,...,...,...,...
645,2,157.0,74.0,35.0,440.0,39.4,0.134,30
715,7,187.0,50.0,33.0,392.0,33.9,0.826,34
72,13,126.0,90.0,29.0,155.0,43.4,0.583,42
235,4,171.0,72.0,29.0,155.0,43.6,0.479,26


In [175]:
# Display the Outcome labels of the held-out rows.
Y_test

285    0
101    0
581    0
352    0
726    0
      ..
563    0
318    0
154    1
684    0
643    0
Name: Outcome, Length: 154, dtype: int64

In [176]:
# Display the Outcome labels of the training rows.
Y_train

663    1
712    1
161    0
509    0
305    0
      ..
645    0
715    1
72     1
235    1
37     1
Name: Outcome, Length: 614, dtype: int64

## DATA PREPROCESSING (StandardScaler and MinMax)

In [177]:
# Create the min-max scaler used to normalise the features.
# NOTE(review): StandardScaler is imported but is not applied in the cells shown here.
scalar = MinMaxScaler()

In [178]:
# Fit the scaler on the training rows only, then apply that same fitted
# mapping to both splits so the scaler never sees the test rows while fitting.
scalar.fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [179]:
# Display the scaled training features (a NumPy array; the repr is abbreviated).
X_train_scaled

array([[0.6       , 0.65584416, 0.57142857, ..., 0.40286299, 0.24632517,
        0.37254902],
       [0.66666667, 0.55194805, 0.3877551 , ..., 0.47034765, 0.15902004,
        0.33333333],
       [0.46666667, 0.37662338, 0.51020408, ..., 0.38854806, 0.05345212,
        0.47058824],
       ...,
       [0.86666667, 0.53246753, 0.67346939, ..., 0.51533742, 0.22227171,
        0.41176471],
       [0.26666667, 0.82467532, 0.48979592, ..., 0.5194274 , 0.17594655,
        0.09803922],
       [0.6       , 0.37662338, 0.53061224, ..., 0.3006135 , 0.25879733,
        0.49019608]])

In [180]:
# Display the scaled test features (a NumPy array; the repr is abbreviated).
X_test_scaled

array([[0.46666667, 0.5974026 , 0.51020408, ..., 0.1595092 , 0.25077951,
        0.58823529],
       [0.06666667, 0.69480519, 0.36734694, ..., 0.16155419, 0.04231626,
        0.01960784],
       [0.4       , 0.42207792, 0.36734694, ..., 0.1390593 , 0.05434298,
        0.11764706],
       ...,
       [0.53333333, 0.93506494, 0.55102041, ..., 0.60736196, 0.02360802,
        0.43137255],
       [0.33333333, 0.5974026 , 0.59183673, ..., 0.28220859, 0.24766147,
        0.94117647],
       [0.26666667, 0.2987013 , 0.48979592, ..., 0.200409  , 0.23429844,
        0.19607843]])

In [181]:
# Rule of thumb for choosing k: take the square root of the number of test
# labels, then pick a nearby odd value.
# NOTE(review): this heuristic is often stated in terms of the training-set
# size instead — confirm which is intended.
math.sqrt(Y_test.shape[0])

12.409673645990857

In [182]:
# k = 11: an odd value close to the square root computed above (~12.4).
# p=2 selects the Euclidean form of the classifier's default Minkowski metric.
knn = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
)

In [183]:
# Fit the classifier on the scaled training features and their Outcome labels.
knn.fit(X=X_train_scaled, y=Y_train)

In [184]:
# Predict a class label for every scaled test row; the bare name displays the array.
y_pred = knn.predict(X_test_scaled)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [185]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
# For the 0/1 Outcome labels the cells are laid out as:
#   [[TN, FP],
#    [FN, TP]]
# i.e. True Negative at (0,0), False Positive at (0,1),
# False Negative at (1,0) and True Positive at (1,1).
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
cm

array([[90,  9],
       [16, 39]], dtype=int64)

In [186]:
# Mean accuracy of the fitted classifier on each split.
# NOTE(review): the recorded output shows the test accuracy above the train
# accuracy — possibly an artefact of this particular split; worth confirming.
train_accuracy = knn.score(X_train_scaled, Y_train)
test_accuracy = knn.score(X_test_scaled, Y_test)
print(
    f'Train_Set accuracy= {train_accuracy}',
    f'Test_Set accuracy= {test_accuracy}',
    sep='\n',
)

Train_Set accuracy= 0.7850162866449512
Test_Set accuracy= 0.8376623376623377
