# <center style='color:green'>`RandomizedSearchCV` using Scikit-Learn</center>

# 1. Import required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# 2. Load `indian_liver_patient_dataset`

In [2]:
# Read the liver-patient CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='indian_liver_patient_dataset.csv')
df.head(n=5)

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df.shape

(583, 11)

In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        583 non-null    int64  
 1   Gender     583 non-null    object 
 2   TB         583 non-null    float64
 3   DB         583 non-null    float64
 4   Alkphos    583 non-null    int64  
 5   Sgpt       583 non-null    int64  
 6   Sgot       583 non-null    int64  
 7   TP         583 non-null    float64
 8   ALB        583 non-null    float64
 9   A/G Ratio  579 non-null    float64
 10  Selector   583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [5]:
# Distinct target labels and how many rows carry each one (class balance check).
df['Selector'].unique(), df['Selector'].value_counts()

(array([1, 2]),
 1    416
 2    167
 Name: Selector, dtype: int64)

# 3. Drop null values

In [6]:
# Discard every row that has at least one missing value, then confirm the new size.
df = df.dropna(axis=0, how='any')
df.shape

(579, 11)

# 4. Perform preprocessing

In [7]:
# Encode Gender numerically: 1 for 'Female', 0 for every other value.
# Matching on both 'Female' and the already-encoded 1 keeps this cell safe to
# re-run: a second execution leaves the column unchanged instead of silently
# turning every row into 0 (which `x == 'Female'` would do on encoded data).
df = df.assign(Gender=df.Gender.isin(['Female', 1]).astype('int64'))
df.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
0,65,1,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,0,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,0,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,0,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,0,3.9,2.0,195,27,59,7.3,2.4,0.4,1


# 5. Separate features and classes

In [8]:
# `Selector` is the last column and holds the class label; everything else is a feature.
X = df.drop(columns='Selector')
y = df['Selector']

# 6. Apply `RandomizedSearchCV` to `Random Forest Classifier`

In [9]:
# Search space for the random forest: 3 x 3 x 3 = 27 possible combinations.
rf_param_space = {
    'n_estimators': [2, 5, 10],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 4, 5],
}

# Sample 15 of those combinations and score each one with 5-fold cross-validation.
# `fit` returns the fitted search object, so `rf` is the fitted RandomizedSearchCV.
rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_param_space,
    n_iter=15,
    cv=5,
    random_state=42,
).fit(X, y)
rf.cv_results_

{'mean_fit_time': array([0.00922832, 0.0042872 , 0.00231977, 0.00235291, 0.00230284,
        0.00715675, 0.00439348, 0.00761023, 0.00241609, 0.00247269,
        0.00398612, 0.0041687 , 0.00718241, 0.00691438, 0.00243516]),
 'std_fit_time': array([2.66602631e-03, 4.31555806e-05, 2.74156072e-05, 1.53378584e-05,
        4.73709338e-05, 1.45595150e-04, 3.40748781e-05, 5.61916933e-05,
        7.05933860e-05, 1.42836123e-04, 8.05276178e-06, 3.77658040e-05,
        6.72701137e-05, 8.59672731e-05, 2.98217329e-05]),
 'mean_score_time': array([0.00125208, 0.00097113, 0.00089698, 0.00089707, 0.00089769,
        0.00107579, 0.00096707, 0.00107017, 0.00090938, 0.00088973,
        0.00096006, 0.00094738, 0.00106554, 0.00102844, 0.00090079]),
 'std_score_time': array([3.02472532e-04, 1.51679194e-05, 3.06958712e-06, 5.88966280e-06,
        3.01352030e-06, 1.24127957e-05, 8.52912268e-06, 4.79357733e-06,
        1.57156731e-05, 4.06683675e-06, 1.49200510e-05, 5.75098034e-06,
        2.11085421e-05, 1.49

In [10]:
# Tabulate the raw CV results and keep only the sampled parameters plus score and rank.
rf_results = pd.DataFrame(rf.cv_results_)
rf_summary_columns = [
    'param_n_estimators',
    'param_criterion',
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
]
rf_results.loc[:, rf_summary_columns]

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score,rank_test_score
0,10,gini,5,0.697841,5
1,5,entropy,4,0.69955,4
2,2,entropy,3,0.687436,8
3,2,log_loss,4,0.677106,11
4,2,gini,3,0.68054,10
5,10,entropy,3,0.713328,3
6,5,entropy,5,0.675397,13
7,10,entropy,5,0.696102,6
8,2,entropy,4,0.677106,11
9,2,log_loss,5,0.649445,14


In [11]:
# Best sampled parameter combination and its mean cross-validated score.
rf.best_params_, rf.best_score_

({'n_estimators': 10, 'max_depth': 3, 'criterion': 'gini'}, 0.7184857571214393)

# 7. Apply `RandomizedSearchCV` to `K Neighbors Classifier`

In [12]:
# Search space for k-nearest neighbours: 3 x 2 x 3 = 18 possible combinations.
# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the same
# distance as 'euclidean', so those two values are effectively duplicate candidates.
# NOTE(review): X is passed unscaled; the displayed rows show features on very
# different ranges (e.g. Alkphos vs. DB), which distance-based models are sensitive to.
knn_param_space = {
    'n_neighbors': [3, 4, 5],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}

# Sample 8 of those combinations and score each one with 5-fold cross-validation.
# `fit` returns the fitted search object, so `knn` is the fitted RandomizedSearchCV.
knn = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=knn_param_space,
    n_iter=8,
    cv=5,
    random_state=42,
).fit(X, y)
knn.cv_results_

{'mean_fit_time': array([0.0020164 , 0.00098567, 0.00099592, 0.00098882, 0.00096936,
        0.00096965, 0.00096922, 0.00096841]),
 'std_fit_time': array([1.94729285e-03, 2.02672060e-05, 2.42200421e-05, 2.53472030e-05,
        2.38132305e-06, 5.59953836e-06, 7.25889630e-06, 2.38609238e-06]),
 'mean_score_time': array([0.00402851, 0.00117178, 0.00379529, 0.00123234, 0.00116801,
        0.00121808, 0.00380607, 0.00123186]),
 'std_score_time': array([2.43825467e-04, 2.16220283e-05, 8.95694655e-05, 1.98248453e-05,
        1.58701550e-05, 1.10858230e-05, 1.51791581e-05, 9.67803244e-06]),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'distance', 'distance', 'uniform', 'distance'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_neighbors': masked_array(data=[3, 3, 4, 5, 4, 3, 5, 4],
              mask=[False, False, False, False, False, False, Fa

In [13]:
# Tabulate the raw CV results and keep only the sampled parameters plus score and rank.
knn_results = pd.DataFrame(knn.cv_results_)
knn_summary_columns = [
    'param_n_neighbors',
    'param_weights',
    'param_metric',
    'mean_test_score',
    'rank_test_score',
]
knn_results.loc[:, knn_summary_columns]

Unnamed: 0,param_n_neighbors,param_weights,param_metric,mean_test_score,rank_test_score
0,3,uniform,minkowski,0.647796,5
1,3,distance,minkowski,0.642609,7
2,4,uniform,euclidean,0.701304,1
3,5,distance,minkowski,0.659865,3
4,4,distance,minkowski,0.665067,2
5,3,distance,manhattan,0.653043,4
6,5,uniform,manhattan,0.646102,6
7,4,distance,manhattan,0.640885,8


In [14]:
# Best sampled parameter combination and its mean cross-validated score.
knn.best_params_, knn.best_score_

({'weights': 'uniform', 'n_neighbors': 4, 'metric': 'euclidean'},
 0.701304347826087)

# 8. Apply `RandomizedSearchCV` to `Decision Tree Classifier`

In [15]:
# Search space for the decision tree: 3 x 3 = 9 possible combinations.
dt_param_space = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 4, 5],
}

# Sample 5 of those combinations and score each one with 5-fold cross-validation.
# `fit` returns the fitted search object, so `dt` is the fitted RandomizedSearchCV.
dt = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=dt_param_space,
    n_iter=5,
    cv=5,
    random_state=42,
).fit(X, y)
dt.cv_results_

{'mean_fit_time': array([0.00171366, 0.00153446, 0.00166588, 0.0013639 , 0.0016531 ]),
 'std_fit_time': array([1.03515581e-04, 5.37376649e-05, 5.49717763e-05, 2.83594265e-05,
        3.74875897e-05]),
 'mean_score_time': array([0.00083151, 0.00076351, 0.00072598, 0.00075421, 0.00072522]),
 'std_score_time': array([2.57887360e-05, 2.29915629e-05, 4.76121365e-06, 1.51004624e-05,
        1.11486118e-05]),
 'param_max_depth': masked_array(data=[4, 4, 5, 3, 5],
              mask=[False, False, False, False, False],
        fill_value=999999),
 'param_criterion': masked_array(data=['log_loss', 'gini', 'entropy', 'gini', 'log_loss'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 4, 'criterion': 'log_loss'},
  {'max_depth': 4, 'criterion': 'gini'},
  {'max_depth': 5, 'criterion': 'entropy'},
  {'max_depth': 3, 'criterion': 'gini'},
  {'max_depth': 5, 'criterion': 'log_loss'}],
 'split0_test_score': array([

In [16]:
# Tabulate the raw CV results and keep only the sampled parameters plus score and rank.
dt_results = pd.DataFrame(dt.cv_results_)
dt_summary_columns = [
    'param_criterion',
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
]
dt_results.loc[:, dt_summary_columns]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score,rank_test_score
0,log_loss,4,0.706402,1
1,gini,4,0.697781,2
2,entropy,5,0.666717,4
3,gini,3,0.675352,3
4,log_loss,5,0.666717,4


In [17]:
# Best sampled parameter combination and its mean cross-validated score.
dt.best_params_, dt.best_score_

({'max_depth': 4, 'criterion': 'log_loss'}, 0.7064017991004498)

# 9. Apply `RandomizedSearchCV` to all the models

In [18]:
# Each entry binds one estimator to its own search space and sampling budget
# (n_iter), so a model can be added, removed or reordered without the grids and
# budgets drifting out of step, as they could when three parallel structures
# were paired purely by position.
search_specs = {
    'RF': (
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [2, 5, 10],
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [3, 4, 5]
        },
        15,
    ),
    'KNN': (
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 4, 5],
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan']
        },
        8,
    ),
    'DT': (
        DecisionTreeClassifier(random_state=42),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [3, 4, 5]
        },
        5,
    ),
}

# Kept under their original names in case other cells refer to them.
clfs = [spec[0] for spec in search_specs.values()]
params_dict = {key: spec[1] for key, spec in search_specs.items()}

# One record per model: the estimator searched, its best mean CV score, and the
# parameter combination that produced it.
scores = []

for key, (clf, param_grid, n) in search_specs.items():
    rs = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=n, cv=5, random_state=42)
    rs.fit(X, y)
    scores.append({'selected_model': clf, 'best_score_obtained': rs.best_score_, 'best_params_obtained': rs.best_params_})

In [19]:
# Show full cell contents so the best-parameter dicts are not truncated.
# The fully qualified option name is used because pandas resolves shorthand
# names by pattern matching, which can break once another option matches too.
pd.set_option('display.max_colwidth', None)

# One row per model, in the order the searches were run.
best = pd.DataFrame(scores, columns=['selected_model', 'best_score_obtained', 'best_params_obtained'])
best

Unnamed: 0,selected_model,best_score_obtained,best_params_obtained
0,RandomForestClassifier(random_state=42),0.718486,"{'n_estimators': 10, 'max_depth': 3, 'criterion': 'gini'}"
1,KNeighborsClassifier(),0.701304,"{'weights': 'uniform', 'n_neighbors': 4, 'metric': 'euclidean'}"
2,DecisionTreeClassifier(random_state=42),0.706402,"{'max_depth': 4, 'criterion': 'log_loss'}"


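# `Random Forest Classifier` achieved the best mean cross-validated accuracy (0.718), ahead of `Decision Tree Classifier` (0.706) and `K Neighbors Classifier` (0.701).

The margins are small, however: before null rows were dropped, the majority class alone accounted for 416 of 583 rows (about 0.71), so none of the tuned models is clearly better than always predicting class 1.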