# <center style='color:blue'>`GridSearchCV` using Scikit-Learn</center>

# 1. Import required libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier 

# 2. Load `early_stage_diabetes_risk_prediction` dataset

In [2]:
# Read the diabetes-risk CSV (path is relative to the working directory)
# and preview the first five rows.
DATA_FILE = 'early_stage_diabetes_risk_prediction.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:
# Show the frame's dimensions as (number of rows, number of columns).
df.shape

(520, 17)

In [4]:
# Print each column's non-null count and dtype to spot missing values
# and columns that still hold non-numeric data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 520 non-null    int64 
 1   gender              520 non-null    object
 2   polyuria            520 non-null    object
 3   polydipsia          520 non-null    object
 4   sudden_weight_loss  520 non-null    object
 5   weakness            520 non-null    object
 6   polyphagia          520 non-null    object
 7   genital_thrush      520 non-null    object
 8   visual_blurring     520 non-null    object
 9   itching             520 non-null    object
 10  irritability        520 non-null    object
 11  delayed_healing     520 non-null    object
 12  partial_paresis     520 non-null    object
 13  muscle_stiffness    520 non-null    object
 14  alopecia            520 non-null    object
 15  obesity             520 non-null    object
 16  class               520 non-null    object

In [5]:
# Inspect the target column: its distinct labels and how many rows carry each one.
class_labels = df['class']
class_labels.unique(), class_labels.value_counts()

(array(['Positive', 'Negative'], dtype=object),
 Positive    320
 Negative    200
 Name: class, dtype: int64)

# 3. Perform preprocessing

In [6]:
# Integer-encode every text column of `df` in place so the later model-fitting
# cells receive purely numeric input.
#
# A column counts as text when its dtype is the NumPy object dtype or a pandas
# string dtype. Comparing with `== 'object'` alone misses string-typed columns
# (the default inference for text in newer pandas releases), which would leave
# them unencoded and break the estimators further down.
#
# NOTE: the same encoder instance is refit on every column, so once the loop
# finishes `labelencoder` only remembers the mapping of the last text column.
labelencoder = LabelEncoder()
for i in df.columns:
    column_dtype = df[i].dtype
    is_text_column = (
        pd.api.types.is_object_dtype(column_dtype)
        or pd.api.types.is_string_dtype(column_dtype)
    )
    if is_text_column:
        df[i] = labelencoder.fit_transform(df[i])
df.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [7]:
# Re-check the target column after encoding: distinct codes and rows per code.
encoded_classes = df['class']
encoded_classes.unique(), encoded_classes.value_counts()

(array([1, 0]),
 1    320
 0    200
 Name: class, dtype: int64)

# 4. Separate features and classes

In [8]:
# The last column is the target; every column before it is a feature.
target_position = df.shape[1] - 1
X = df.iloc[:, :target_position]
y = df.iloc[:, target_position]

# 5. Apply `GridSearchCV` to `Random Forest Classifier`

In [9]:
# Grid-search a random forest over 3 tree counts x 3 depth limits (9 candidates),
# evaluating every candidate with 5-fold cross-validation.
rf_param_grid = {
    'n_estimators': [2, 5, 10],
    'max_depth': [3, 4, 5],
}
rf_base_model = RandomForestClassifier(random_state=42)
rf = GridSearchCV(estimator=rf_base_model, param_grid=rf_param_grid, cv=5)
rf.fit(X, y)
# Display the raw per-candidate timing and score arrays recorded by the search.
rf.cv_results_

{'mean_fit_time': array([0.00638633, 0.00625811, 0.01001749, 0.00324454, 0.00571418,
        0.00969992, 0.00323639, 0.00569711, 0.0097929 ]),
 'std_fit_time': array([3.15680893e-03, 1.14763679e-04, 2.10641302e-04, 1.11641011e-05,
        2.99262332e-05, 4.89938520e-05, 1.01237818e-05, 4.90888975e-05,
        6.07088051e-05]),
 'mean_score_time': array([0.00201373, 0.00149479, 0.00154037, 0.00124211, 0.00134487,
        0.00146408, 0.00123844, 0.00133634, 0.00149431]),
 'std_score_time': array([7.47824741e-04, 5.38536011e-05, 3.06170303e-05, 5.41666863e-06,
        1.69866503e-05, 7.74533863e-06, 1.45295342e-05, 1.02436786e-05,
        2.02409370e-05]),
 'param_max_depth': masked_array(data=[3, 3, 3, 4, 4, 4, 5, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'param_n_estimators': masked_array(data=[2, 5, 10, 2, 5, 10, 2, 5, 10],
              mask=[False, False, False, False, False, False, Fals

In [10]:
# Turn the search log into a table and show only the tuned parameters,
# the mean cross-validated score and the resulting rank.
rf_results = pd.DataFrame(rf.cv_results_)
rf_report_columns = [
    'param_n_estimators',
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
]
rf_results[rf_report_columns]

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score,rank_test_score
0,2,3,0.859615,9
1,5,3,0.898077,7
2,10,3,0.903846,6
3,2,4,0.896154,8
4,5,4,0.917308,4
5,10,4,0.932692,2
6,2,5,0.915385,5
7,5,5,0.928846,3
8,10,5,0.946154,1


In [11]:
# Best parameter combination found by the random-forest search and its mean
# cross-validated score.
rf.best_params_, rf.best_score_

({'max_depth': 5, 'n_estimators': 10}, 0.9461538461538461)

# 6. Apply `GridSearchCV` to `Extra Trees Classifier`

In [12]:
# Grid-search an extra-trees ensemble over 3 tree counts x 3 depth limits
# (9 candidates), evaluating every candidate with 5-fold cross-validation.
etc_param_grid = {
    'n_estimators': [2, 5, 10],
    'max_depth': [3, 4, 5],
}
etc_base_model = ExtraTreesClassifier(random_state=42)
etc = GridSearchCV(estimator=etc_base_model, param_grid=etc_param_grid, cv=5)
etc.fit(X, y)
# Display the raw per-candidate timing and score arrays recorded by the search.
etc.cv_results_

{'mean_fit_time': array([0.00512743, 0.00474739, 0.00749717, 0.00280056, 0.00460291,
        0.00756168, 0.00283279, 0.0046504 , 0.00768795]),
 'std_fit_time': array([2.26077951e-03, 4.23330970e-05, 9.76413710e-05, 1.07194011e-05,
        4.80232074e-05, 6.19233225e-05, 4.60059152e-05, 2.60810311e-05,
        4.59732847e-05]),
 'mean_score_time': array([0.00189385, 0.00140924, 0.00148983, 0.00125322, 0.00134478,
        0.00149531, 0.00125256, 0.00135055, 0.00149665]),
 'std_score_time': array([5.59477340e-04, 4.22315696e-05, 5.02243323e-06, 9.53865032e-06,
        7.39574279e-06, 7.42458577e-06, 6.35929885e-06, 3.31941686e-06,
        8.12947923e-06]),
 'param_max_depth': masked_array(data=[3, 3, 3, 4, 4, 4, 5, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'param_n_estimators': masked_array(data=[2, 5, 10, 2, 5, 10, 2, 5, 10],
              mask=[False, False, False, False, False, False, Fals

In [13]:
# Turn the search log into a table and show only the tuned parameters,
# the mean cross-validated score and the resulting rank.
etc_results = pd.DataFrame(etc.cv_results_)
etc_report_columns = [
    'param_n_estimators',
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
]
etc_results[etc_report_columns]

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score,rank_test_score
0,2,3,0.882692,9
1,5,3,0.884615,8
2,10,3,0.901923,5
3,2,4,0.890385,7
4,5,4,0.9,6
5,10,4,0.921154,3
6,2,5,0.911538,4
7,5,5,0.932692,2
8,10,5,0.951923,1


In [14]:
# Best parameter combination found by the extra-trees search and its mean
# cross-validated score.
etc.best_params_, etc.best_score_

({'max_depth': 5, 'n_estimators': 10}, 0.9519230769230769)

# 7. Apply `GridSearchCV` to `Gradient Boosting Classifier`

In [15]:
# Grid-search a gradient-boosting model over 3 stage counts x 3 depth limits
# (9 candidates), evaluating every candidate with 5-fold cross-validation.
gbc_param_grid = {
    'n_estimators': [2, 5, 10],
    'max_depth': [3, 4, 5],
}
gbc_base_model = GradientBoostingClassifier(random_state=42)
gbc = GridSearchCV(estimator=gbc_base_model, param_grid=gbc_param_grid, cv=5)
gbc.fit(X, y)
# Display the raw per-candidate timing and score arrays recorded by the search.
gbc.cv_results_

{'mean_fit_time': array([0.00667319, 0.00592556, 0.00976319, 0.00367956, 0.00684018,
        0.01227489, 0.0041925 , 0.00790458, 0.01422453]),
 'std_fit_time': array([2.87843324e-03, 1.28372640e-04, 1.51855933e-04, 3.06032885e-05,
        1.38837049e-04, 3.04190887e-04, 1.22230172e-04, 1.15168306e-04,
        1.81138568e-04]),
 'mean_score_time': array([0.00210681, 0.00120802, 0.00117965, 0.00116491, 0.00117407,
        0.0012002 , 0.00117173, 0.00118227, 0.00120182]),
 'std_score_time': array([8.39618743e-04, 2.14743020e-05, 7.55361835e-06, 6.94285382e-06,
        4.27720539e-06, 3.23290670e-05, 1.86641398e-05, 1.91813044e-05,
        4.41648522e-05]),
 'param_max_depth': masked_array(data=[3, 3, 3, 4, 4, 4, 5, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'param_n_estimators': masked_array(data=[2, 5, 10, 2, 5, 10, 2, 5, 10],
              mask=[False, False, False, False, False, False, Fals

In [16]:
# Turn the search log into a table and show only the tuned parameters,
# the mean cross-validated score and the resulting rank.
gbc_results = pd.DataFrame(gbc.cv_results_)
gbc_report_columns = [
    'param_n_estimators',
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
]
gbc_results[gbc_report_columns]

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score,rank_test_score
0,2,3,0.623077,9
1,5,3,0.915385,5
2,10,3,0.917308,4
3,2,4,0.638462,8
4,5,4,0.913462,6
5,10,4,0.944231,3
6,2,5,0.678846,7
7,5,5,0.959615,1
8,10,5,0.957692,2


In [17]:
# Best parameter combination found by the gradient-boosting search and its mean
# cross-validated score.
gbc.best_params_, gbc.best_score_

({'max_depth': 5, 'n_estimators': 5}, 0.9596153846153846)

# 8. Apply `GridSearchCV` to all the models

In [18]:
# Run the same 5-fold grid search over all three ensemble classifiers and
# record each one's best score and best parameters for a side-by-side comparison.
params_dict = {
    'n_estimators': [2, 5, 10],
    'max_depth': [3, 4, 5]
}

clfs = [
    RandomForestClassifier(random_state=42),
    ExtraTreesClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]

# One summary record per classifier, in the same order as `clfs`.
scores = []

for clf in clfs:
    gs = GridSearchCV(clf, param_grid=params_dict, cv=5)
    gs.fit(X, y)
    record = {
        'selected_model': clf,
        'best_score_obtained': gs.best_score_,
        'best_params_obtained': gs.best_params_,
    }
    scores.append(record)

In [19]:
# Assemble the per-model records into one comparison table with a fixed column order.
summary_columns = ['selected_model', 'best_score_obtained', 'best_params_obtained']
best = pd.DataFrame(scores, columns=summary_columns)
best

Unnamed: 0,selected_model,best_score_obtained,best_params_obtained
0,RandomForestClassifier(random_state=42),0.946154,"{'max_depth': 5, 'n_estimators': 10}"
1,ExtraTreesClassifier(random_state=42),0.951923,"{'max_depth': 5, 'n_estimators': 10}"
2,GradientBoostingClassifier(random_state=42),0.959615,"{'max_depth': 5, 'n_estimators': 5}"


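# `Gradient Boosting Classifier` achieved the highest best cross-validation score (≈ 0.960) of the three models, narrowly ahead of `Extra Trees Classifier` (≈ 0.952) and `Random Forest Classifier` (≈ 0.946).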