In [3]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

In [4]:
#Importing PIMA Diabetes dataset
df = pd.read_csv("diabetes.csv")
print(df.shape)
df.head()


(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [11]:
any(df['Glucose'].value_counts().keys()==0)

True

In [13]:
df[df['Glucose']==0].shape

(5, 9)

In [7]:
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])


In [8]:
print(any(df['Glucose'].value_counts().keys()==0))
print(any(df['Insulin'].value_counts().keys()==0))
print(any(df['SkinThickness'].value_counts().keys()==0))

False
False
False


In [16]:
df[df['Glucose']==0].shape

(0, 9)

In [23]:
df[df['SkinThickness']==0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
2,8,183.0,64,0,30.5,23.3,0.672,32,1
5,5,116.0,74,0,30.5,25.6,0.201,30,0
7,10,115.0,0,0,30.5,35.3,0.134,29,0
9,8,125.0,96,0,30.5,0.0,0.232,54,1
10,4,110.0,92,0,30.5,37.6,0.191,30,0
...,...,...,...,...,...,...,...,...,...
757,0,123.0,72,0,30.5,36.3,0.258,52,1
758,1,106.0,76,0,30.5,37.5,0.197,26,0
759,6,190.0,92,0,30.5,35.5,0.278,66,1
762,9,89.0,62,0,30.5,22.5,0.142,33,0


**No need to do scaling as we are going to use Random Forest Classifier. It won't have any impact**

### Split dataset for X and Y

In [9]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X.shape,y.shape

((768, 8), (768,))

In [27]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=10)
rf_classifier.fit(X_train,y_train)
y_test_pred = rf_classifier.predict(X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [13]:
print(confusion_matrix(y_test,y_test_pred))
print(classification_report(y_test,y_test_pred))
print(accuracy_score(y_test,y_test_pred))

[[87 12]
 [29 26]]
              precision    recall  f1-score   support

           0       0.75      0.88      0.81        99
           1       0.68      0.47      0.56        55

    accuracy                           0.73       154
   macro avg       0.72      0.68      0.68       154
weighted avg       0.73      0.73      0.72       154

0.7337662337662337


### Manual Hyperparameter Tuning

In [68]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=300, criterion='entropy',max_features='sqrt', min_samples_leaf=10)
rf_classifier.fit(X_train,y_train)
y_test_pred = rf_classifier.predict(X_test)

In [69]:
print(confusion_matrix(y_test,y_test_pred))
print(classification_report(y_test,y_test_pred))
print(accuracy_score(y_test,y_test_pred))

[[88 11]
 [28 27]]
              precision    recall  f1-score   support

           0       0.76      0.89      0.82        99
           1       0.71      0.49      0.58        55

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.73       154

0.7467532467532467


### Randomized Search CV

In [14]:
from sklearn.model_selection import RandomizedSearchCV


In [71]:
n_estimators = [int(x) for x in np.linspace(200,2000,10)]
max_features = ['auto','sqrt','logs']
max_depth = [int(x) for x in np.linspace(10,1000,10)]
min_samples_split = [2,3,10,14]
min_samples_leaf = [1,2,4,6,8]

In [72]:
random_grid = {
    "n_estimators" : n_estimators,
    "max_features" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "criterion"  : ['gini','entropy']
}

print(random_grid)    

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'logs'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 3, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['gini', 'entropy']}


In [73]:
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter=100, cv=3,verbose=2,
                                 random_state=100,n_jobs=-1)
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'logs'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [74]:
rf_randomcv.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 1000,
 'criterion': 'gini'}

In [75]:
best_model = rf_randomcv.best_estimator_

In [76]:
best_model

RandomForestClassifier(max_depth=1000, max_features='sqrt', min_samples_leaf=2,
                       n_estimators=1400)

In [79]:
y_pred = best_model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy score:", accuracy_score(y_test,y_pred))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[86 13]
 [24 31]]
Accuracy score: 0.7597402597402597
Classification report:               precision    recall  f1-score   support

           0       0.78      0.87      0.82        99
           1       0.70      0.56      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.72       154
weighted avg       0.75      0.76      0.75       154



### GridSearchCV

In [80]:
from sklearn.model_selection import GridSearchCV

In [81]:
rf_randomcv.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 1000,
 'criterion': 'gini'}

In [83]:
type(rf_randomcv.best_params_)

dict

In [84]:
param_grid = {
    "n_estimators" : [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100,
                      rf_randomcv.best_params_['n_estimators'], rf_randomcv.best_params_['n_estimators'] + 100, 
                      rf_randomcv.best_params_['n_estimators'] + 200],
    "min_samples_split" : [rf_randomcv.best_params_['min_samples_split'],
                           rf_randomcv.best_params_['min_samples_split'] + 1,
                           rf_randomcv.best_params_['min_samples_split'] + 2],
    "min_samples_leaf" : [rf_randomcv.best_params_['min_samples_leaf'],
                           rf_randomcv.best_params_['min_samples_leaf'] + 1,
                           rf_randomcv.best_params_['min_samples_leaf'] + 2],
    "max_features" : [rf_randomcv.best_params_['max_features']],
    'max_depth' : [rf_randomcv.best_params_['max_depth']],
    "criterion" : [rf_randomcv.best_params_['criterion']]
}


In [87]:
rf = RandomForestClassifier()
rf_gridcv = GridSearchCV(estimator = rf, param_grid = param_grid, cv=10,verbose=2,
                                 n_jobs=-1)
rf_gridcv.fit(X_train,y_train)

Fitting 10 folds for each of 45 candidates, totalling 450 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [1000],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [1200, 1300, 1400, 1500, 1600]},
             verbose=2)

In [92]:
rf_gridcv.best_params_

{'criterion': 'gini',
 'max_depth': 1000,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'n_estimators': 1500}

In [93]:
best_model = rf_gridcv.best_estimator_

In [94]:
y_pred_grid = best_model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy score:", accuracy_score(y_test,y_pred))
print("Classification report: {}".format(classification_report(y_test,y_pred)))


[[86 13]
 [24 31]]
Accuracy score: 0.7597402597402597
Classification report:               precision    recall  f1-score   support

           0       0.78      0.87      0.82        99
           1       0.70      0.56      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.72       154
weighted avg       0.75      0.76      0.75       154



### Automated Hyperparameter Tuning

#### Bayesian Optimization

#### Bayesian Optimization


In [101]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

In [98]:
space = {
    "criterion" : hp.choice('criterion' , ['entropy','gini']),
    "max_depth" : hp.quniform('max_depth', 10, 1200, 10),
    "max_features" : hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    "min_samples_leaf" : hp.uniform('min_samples_leaf', 0, 0.5),
    "min_samples_split" : hp.uniform ('min_samples_split', 0, 1),
    'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
}
    

In [99]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x1e9085b3700>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x1e9085b33d0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x1e9085a12e0>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x1e906cfa460>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x1e906cfaeb0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x1e906cfa4c0>}

In [100]:
def objective(space):
    model = RandomForestClassifier(criterion=space['criterion'],max_depth=space['max_depth'],
                                   max_features=space['max_features'], min_samples_leaf=space['min_samples_leaf'],
                                   min_samples_split=space['min_samples_split'], n_estimators = space['n_estimators'])
    accuracy =  cross_val_score(model, X_train, y_train, cv=5).mean()
    
    return {'loss': -accuracy, 'status': STATUS_OK }

In [102]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo = tpe.suggest,
            max_evals=80,
            trials = trials)
            
best

100%|██████████| 80/80 [04:32<00:00,  3.41s/trial, best loss: -0.7769159003065441]


{'criterion': 1,
 'max_depth': 860.0,
 'max_features': 1,
 'min_samples_leaf': 0.015700478345609198,
 'min_samples_split': 0.01501701733755585,
 'n_estimators': 5}

In [104]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])
print(best['min_samples_leaf'])

gini
sqrt
1300
0.015700478345609198


In [105]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[89 10]
 [28 27]]
0.7532467532467533
              precision    recall  f1-score   support

           0       0.76      0.90      0.82        99
           1       0.73      0.49      0.59        55

    accuracy                           0.75       154
   macro avg       0.75      0.69      0.71       154
weighted avg       0.75      0.75      0.74       154



### Genetic Algorithm



In [15]:
#Initialize parameters

n_estimators = [int(x) for x in np.linspace(200,2000,10)]
max_features = ['auto','sqrt','logs']
max_depth = [int(x) for x in np.linspace(10,1000,10)]
min_samples_split = [2,3,10,14]
min_samples_leaf = [1,2,4,6,8]



In [16]:
param = {
    "n_estimators" : n_estimators,
    "max_features" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "criterion"  : ['gini','entropy']
}


In [24]:
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

Version 0.11.7 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


Optimization Progress:   0%|          | 0/84 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7818309141838553

Generation 2 - Current best internal CV score: 0.7818309141838553

Generation 3 - Current best internal CV score: 0.7818309141838553

Generation 4 - Current best internal CV score: 0.7818309141838553

Generation 5 - Current best internal CV score: 0.7818309141838553

Best pipeline: RandomForestClassifier(input_matrix, criterion=gini, max_depth=890, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=600)


In [25]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.7662337662337663


### Optuna


**Similar to Hyperopt**

In [27]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [28]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2024-06-25 17:50:22,554] A new study created in memory with name: no-name-3760b865-3e4a-4699-aa48-c045c6cd5b13
[I 2024-06-25 17:50:29,330] Trial 0 finished with value: 0.7736649131197195 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1840, 'max_depth': 12.871241963105277}. Best is trial 0 with value: 0.7736649131197195.
[I 2024-06-25 17:50:33,299] Trial 1 finished with value: 0.7687788936712897 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1130, 'max_depth': 10.681870651456311}. Best is trial 0 with value: 0.7736649131197195.
[I 2024-06-25 17:50:35,424] Trial 2 finished with value: 0.772030926191615 and parameters: {'classifier': 'RandomForest', 'n_estimators': 280, 'max_depth': 57.42401360079446}. Best is trial 0 with value: 0.7736649131197195.
[I 2024-06-25 17:50:36,362] Trial 3 finished with value: 0.7703809979276263 and parameters: {'classifier': 'RandomForest', 'n_estimators': 370, 'max_depth': 19.334830820000853}. Best is trial 0 with value: 0.

Accuracy: 0.7785509325681493
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1540, 'max_depth': 10.906874632415755}


In [29]:
trial

FrozenTrial(number=61, state=TrialState.COMPLETE, values=[0.7785509325681493], datetime_start=datetime.datetime(2024, 6, 25, 17, 53, 20, 620923), datetime_complete=datetime.datetime(2024, 6, 25, 17, 53, 24, 385448), params={'classifier': 'RandomForest', 'n_estimators': 1540, 'max_depth': 10.906874632415755}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntDistribution(high=2000, log=False, low=200, step=10), 'max_depth': FloatDistribution(high=100.0, log=True, low=10.0, step=None)}, trial_id=61, value=None)

In [30]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1540,
 'max_depth': 10.906874632415755}

In [31]:
rf=RandomForestClassifier(n_estimators=1540,max_depth=11)
rf.fit(X_train,y_train)

In [32]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[86 13]
 [24 31]]
0.7597402597402597
              precision    recall  f1-score   support

           0       0.78      0.87      0.82        99
           1       0.70      0.56      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.72       154
weighted avg       0.75      0.76      0.75       154

