Importing dependencies

In [184]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


Data Collection and Analysis

In [185]:
# Load the dataset; the relative path means 'diabetes.csv' must sit in the notebook's working directory.
diabetes_data = pd.read_csv('diabetes.csv')

In [186]:
# Preview the first 10 rows to check the column names and typical values.
diabetes_data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [187]:
# (number of rows, number of columns) of the loaded frame.
diabetes_data.shape

(768, 9)

In [188]:
# Column dtypes and non-null counts.
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [189]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [190]:
# Class distribution of the target. In the recorded output the classes are imbalanced
# (500 vs 268), so accuracy should be read against a ~65% majority-class baseline.
diabetes_data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [191]:
# Count NaN entries per column.
# NOTE(review): the describe() output above reports a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI. Those zeros are presumably
# placeholders for missing measurements, which isnull() does not detect; confirm
# against the data source before trusting the "no missing values" result.
diabetes_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Data Preprocessing

Splitting of data features and outcome

In [192]:
# Target: the Outcome label column. Features: every remaining column.
Y = diabetes_data['Outcome']
X = diabetes_data.drop(columns='Outcome')

In [193]:
# Inspect the feature matrix (pandas prints a truncated view of a frame this size).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [194]:
# Inspect the target vector.
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Standardization of the data: because the features are on very different scales, we need to standardize them (zero mean, unit variance) before modelling.

In [195]:
# Scaler used to standardize the features (fitted in the next cell).
se = StandardScaler()

In [196]:
# Fit the scaler on the feature matrix and get the standardized values back as a NumPy array.
# NOTE(review): the scaler is fit on ALL rows, and the scaled data is later passed to
# cross-validation. Each fold's held-out rows therefore already influenced the scaling
# statistics (mild data leakage). Wrapping scaler + estimator in a sklearn Pipeline and
# cross-validating the pipeline on the raw X would avoid this.
standarized_X =se.fit_transform(X)

In [197]:
# Show the scaled array (NumPy elides the middle rows and columns of large arrays).
print(standarized_X)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [198]:
# Wrap the scaled array in a DataFrame so the original feature names are kept.
diabetes_data_ = pd.DataFrame(standarized_X,columns=X.columns)

In [199]:
# Quick look at the standardized features.
diabetes_data_.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496


Model selection and Hyperparameter Tuning 

Importing models 

In [200]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Training the different models and their cross-validation scores:

In [201]:
# Baseline comparison: 5-fold cross-validated accuracy of each classifier with its
# default hyperparameters, evaluated on the standardized features.
models = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
]
for model in models:
    # cross_val_score clones the estimator and fits the clone on each training fold,
    # so a separate model.fit() on the full data beforehand is redundant and has no
    # effect on the scores. The estimators in `models` are left unfitted.
    cross_score = cross_val_score(model, diabetes_data_, Y, cv=5)
    print(f'The Accuracy Scores of the {model} is {cross_score}')
    # Mean fold accuracy as a percentage, rounded to two decimals.
    cross_score1 = round(cross_score.mean() * 100, 2)
    print(f'The Accuracy of the Model is {cross_score1}')
    print('--------------------------------------------------------------------------------------------------------------')

The Accuracy Scores of the LogisticRegression() is [0.77272727 0.74675325 0.75324675 0.81699346 0.76470588]
The Accuracy of the Model is 77.09
--------------------------------------------------------------------------------------------------------------
The Accuracy Scores of the SVC() is [0.76623377 0.75324675 0.74675325 0.81045752 0.77777778]
The Accuracy of the Model is 77.09
--------------------------------------------------------------------------------------------------------------
The Accuracy Scores of the KNeighborsClassifier() is [0.72077922 0.73376623 0.71428571 0.77124183 0.7254902 ]
The Accuracy of the Model is 73.31
--------------------------------------------------------------------------------------------------------------
The Accuracy Scores of the DecisionTreeClassifier(random_state=0) is [0.68831169 0.66233766 0.68181818 0.78431373 0.7124183 ]
The Accuracy of the Model is 70.58
------------------------------------------------------------------------------------------

Model Selection 

Hyperparameter tuning, followed by the cross-validation score of each optimized model

In [202]:
# Hyperparameter grids to search, one entry per estimator.
# The entries are kept in the same order as the estimator lists in this notebook
# (logistic regression, SVC, k-NN, decision tree, random forest) because grids and
# estimators are matched up by position.
model_parameters = dict(
    LR_parameter=dict(C=[1, 5, 10, 15, 20]),
    SVC_parameter=dict(
        C=[1, 5, 10, 15, 20],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    ),
    KNN_parameter=dict(n_neighbors=[3, 5, 10]),
    decision_tree_parameters=dict(max_depth=[3, 5, 10, None]),
    random_parameter=dict(n_estimators=[10, 20, 30, 50, 100]),
)

In [203]:
# Labels of the parameter grids, in insertion order.
model_parameters.keys()

dict_keys(['LR_parameter', 'SVC_parameter', 'KNN_parameter', 'decision_tree_parameters', 'random_parameter'])

In [204]:
# The parameter grids themselves, in the same insertion order as the labels.
model_parameters.values()

dict_values([{'C': [1, 5, 10, 15, 20]}, {'C': [1, 5, 10, 15, 20], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}, {'n_neighbors': [3, 5, 10]}, {'max_depth': [3, 5, 10, None]}, {'n_estimators': [10, 20, 30, 50, 100]}])

In [205]:
# Grid labels as a list, so they can be indexed by position.
model_keys = list(model_parameters.keys())

In [206]:
# Check the order of the grid labels.
print(model_keys)

['LR_parameter', 'SVC_parameter', 'KNN_parameter', 'decision_tree_parameters', 'random_parameter']


Model selection using GridSearchCV

Model selection Function

In [207]:
# Fresh, unfitted estimators for the grid search, listed in the same order as the
# entries of model_parameters.
model_list = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
]

In [208]:
def model_selection(models,model_parameters):
    """Grid-search every estimator over its parameter grid and tabulate the winners.

    Parameters
    ----------
    models : list
        scikit-learn estimators, in the same order as the entries of
        ``model_parameters``.
    model_parameters : dict
        Maps a label to a GridSearchCV parameter grid. Entries are paired with
        ``models`` by position (dict insertion order); the label itself is only
        printed as a progress message.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``'Models used : '``,
        ``'Model Best Parameters : '`` and ``'Best Score : '`` (the best mean
        5-fold cross-validation score found by the search).

    Raises
    ------
    IndexError
        If more models than parameter grids are supplied.

    Notes
    -----
    The search is run on the notebook-level ``diabetes_data_`` (features) and
    ``Y`` (target); those must be defined before calling this function.
    """
    models = list(models)
    # Pair by position using the dict that was actually passed in, instead of
    # relying on a separately maintained global list of its keys.
    param_items = list(model_parameters.items())
    if len(models) > len(param_items):
        raise IndexError(
            f'{len(models)} models were given but only {len(param_items)} parameter grids'
        )

    results = []
    for model, (label, params) in zip(models, param_items):
        print(label,': ',params)

        classifier = GridSearchCV(model,params,cv=5)
        classifier.fit(diabetes_data_,Y)

        results.append({
            'Models used : ' : model.__class__.__name__,
            'Model Best Parameters : ' : classifier.best_params_,
            'Best Score : ' : classifier.best_score_
        })

    result_data = pd.DataFrame(results,columns=['Models used : ','Model Best Parameters : ','Best Score : '])
    return result_data
    


In [209]:
# Run the grid search for every estimator and display the summary table.
model_selection(models=model_list,model_parameters=model_parameters)

LR_parameter :  {'C': [1, 5, 10, 15, 20]}
SVC_parameter :  {'C': [1, 5, 10, 15, 20], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
KNN_parameter :  {'n_neighbors': [3, 5, 10]}
decision_tree_parameters :  {'max_depth': [3, 5, 10, None]}
random_parameter :  {'n_estimators': [10, 20, 30, 50, 100]}


Unnamed: 0,Models used :,Model Best Parameters :,Best Score :
0,LogisticRegression,{'C': 1},0.770885
1,SVC,"{'C': 5, 'kernel': 'rbf'}",0.777455
2,KNeighborsClassifier,{'n_neighbors': 3},0.742254
3,DecisionTreeClassifier,{'max_depth': 5},0.746176
4,RandomForestClassifier,{'n_estimators': 100},0.774815


Inference: In our case, the SVC works best, with C = 5 and the rbf kernel, reaching a cross-validated accuracy of 77.75 %. The RandomForestClassifier with n_estimators = 100 is a close second at 77.48 %.