In [76]:
import numpy as np
import pandas as pd
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Feature Engineering

# Loading Data

In [7]:
data=pd.read_csv("diabetes.csv")

In [4]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Since there is no null values and Catogoricl data we can further proceed the data for Feature Engineering process ie. Data Scaling, Splitting data.

# Data Scaling

In [84]:
x1=data.drop(["Outcome"],axis=1)
y=data["Outcome"]

In [85]:
scalar=MinMaxScaler()
x=scalar.fit_transform(x1)
x= pd.DataFrame(x,columns=x1.columns,index=x1.index)

In [86]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


# Feature selection

In [87]:
cor = data.corr()
cor_target = abs(cor["Outcome"])
cor_features = cor_target[cor_target>0.1]
cor_features

Pregnancies                 0.221898
Glucose                     0.466581
Insulin                     0.130548
BMI                         0.292695
DiabetesPedigreeFunction    0.173844
Age                         0.238356
Outcome                     1.000000
Name: Outcome, dtype: float64

In [96]:
x=x.drop(['BloodPressure','SkinThickness'],axis=1)
x

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.000000,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.000000,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.000000,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.111111,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...
763,0.588235,0.507538,0.212766,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.000000,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.132388,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.000000,0.448584,0.115713,0.433333


Since BloodPressure and SkinThickness have low correletion i.e., below 0.1 we neglect it  

# Splitting Data

In [97]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.10)

In [98]:
xtrain

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
606,0.058824,0.909548,0.346336,0.596125,0.503843,0.016667
391,0.294118,0.834171,0.000000,0.681073,0.111870,0.100000
18,0.058824,0.517588,0.098109,0.645306,0.044833,0.200000
261,0.176471,0.708543,0.000000,0.447094,0.291631,0.100000
35,0.235294,0.517588,0.226950,0.357675,0.379163,0.200000
...,...,...,...,...,...,...
172,0.117647,0.437186,0.000000,0.430700,0.296755,0.066667
128,0.058824,0.587940,0.171395,0.514158,0.138770,0.316667
50,0.058824,0.517588,0.096927,0.289121,0.176345,0.016667
517,0.411765,0.628141,0.000000,0.560358,0.096499,0.500000


In [99]:
ytrain

606    1
391    1
18     0
261    1
35     0
      ..
172    0
128    1
50     0
517    0
667    1
Name: Outcome, Length: 691, dtype: int64

# Model Selection

In [100]:
dtr=DecisionTreeClassifier()
rfc=RandomForestClassifier()
lgr=LogisticRegression()
svr=SVR()
knn=KNeighborsClassifier()
nbc=GaussianNB()

# Training

In [101]:
dtr.fit(xtrain,ytrain)
rfc.fit(xtrain,ytrain)
lgr.fit(xtrain,ytrain)
svr.fit(xtrain,ytrain)
knn.fit(xtrain,ytrain)
nbc.fit(xtrain,ytrain)

GaussianNB()

# Predicting

In [102]:
ypred_dtr=dtr.predict(xtest)
ypred_rfc=rfc.predict(xtest)
ypred_lgr=lgr.predict(xtest)
ypred_svr=svr.predict(xtest)
ypred_knn=knn.predict(xtest)
ypred_nbc=nbc.predict(xtest)


# Finding Efficient Model

In [103]:
accuracy_dtr=mean_squared_error(ytest,ypred_dtr)
accuracy_rfc=mean_squared_error(ytest,ypred_rfc)
accuracy_lgr=mean_squared_error(ytest,ypred_lgr)
accuracy_svr=mean_squared_error(ytest,ypred_svr)
accuracy_knn=mean_squared_error(ytest,ypred_knn)
accuracy_nbc=mean_squared_error(ytest,ypred_knn)

In [104]:
print ("RandomForestClassifier:", accuracy_rfc)
print ("Decision Tree Classifier:",accuracy_dtr)
print ("Logistic Regression:",accuracy_lgr)
print ("Support Vector Regression:",accuracy_svr)
print ("K Nearest Neighbor:",accuracy_svr)
print ("Naive Bayes Classification:",accuracy_svr)

RandomForestClassifier: 0.16883116883116883
Decision Tree Classifier: 0.24675324675324675
Logistic Regression: 0.18181818181818182
Support Vector Regression: 0.12647544819317394
K Nearest Neighbor: 0.12647544819317394
Naive Bayes Classification: 0.12647544819317394


Since Support Vector Regression/KNearestNeighbor/NaiveBayesClassification have high accuracy so we can use that Model

# K Nearest Neighbor

In [105]:
knn=KNeighborsClassifier()

TRAINING DATA

In [106]:
knn.fit(xtrain,ytrain)

KNeighborsClassifier()

PREDICTING 

In [107]:
ypred_knn=knn.predict(xtest)