In [90]:
from sklearn.datasets import load_digits
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import warnings # supress warnings
warnings.filterwarnings('ignore')
import timeit
data = load_digits()
df_digit = pd.DataFrame(data.data, columns=data.feature_names)
df_digit['class'] = pd.DataFrame(data.target)
df_digit.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,class
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [91]:
X = df_digit.drop(columns='class')
y = df_digit['class']

In [92]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [93]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [94]:
import numpy as np
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# reference: https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html

### Grid Search + Randomized search for Random Forest. Optimizing F1 score

In [95]:
param_grid = {'bootstrap': [True, False],'max_depth': [1,2,3],'n_estimators': [10, 50, 100, 200, 500],'max_features': [1,2,3,4,5,6,7,8]}
start_time = timeit.default_timer()
grid_search_clf = GridSearchCV(clf,param_grid,scoring="f1_macro",n_jobs=4) # for multi class we need macro
grid_search_clf.fit(X_train,y_train)
end_time = timeit.default_timer()
print("Time Grid SearchCv take to find best parameters: %0.3f sec" %(end_time-start_time))
print("Best parameters set found on development set:")
print()
print(grid_search_clf.best_params_)
print()
print("Top 3 best parameters are ")
report(grid_search_clf.cv_results_)

Time Grid SearchCv take to find best parameters: 292.241 sec
Best parameters set found on development set:

{'bootstrap': True, 'max_depth': 3, 'max_features': 3, 'n_estimators': 200}

Top 3 best parameters are 
Model with rank: 1
Mean validation score: 0.891 (std: 0.014)
Parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 3, 'n_estimators': 200}

Model with rank: 2
Mean validation score: 0.881 (std: 0.008)
Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 3, 'n_estimators': 500}

Model with rank: 3
Mean validation score: 0.880 (std: 0.019)
Parameters: {'bootstrap': False, 'max_depth': 3, 'max_features': 2, 'n_estimators': 100}



In [97]:
n_iter_search = 20 #20 candidates parameter settings.
start_time = timeit.default_timer()
rand_search_clf = RandomizedSearchCV(clf, param_grid, scoring='f1_macro',n_iter=n_iter_search ) # for multi class we need macro
rand_search_clf.fit(X_train,y_train)
end_time = timeit.default_timer()
print("Time Grid SearchCv take to find best parameters: %0.3f sec" %(end_time-start_time))
print("Best parameters set found on development set:")
print()
print(rand_search_clf.best_params_)
print()
print("Top 3 best parameters are :")
report(rand_search_clf.cv_results_)

Time Grid SearchCv take to find best parameters: 32.067 sec
Best parameters set found on development set:

{'n_estimators': 500, 'max_features': 3, 'max_depth': 3, 'bootstrap': False}

Top 3 best parameters are :
Model with rank: 1
Mean validation score: 0.881 (std: 0.018)
Parameters: {'n_estimators': 500, 'max_features': 3, 'max_depth': 3, 'bootstrap': False}

Model with rank: 2
Mean validation score: 0.876 (std: 0.021)
Parameters: {'n_estimators': 200, 'max_features': 7, 'max_depth': 3, 'bootstrap': True}

Model with rank: 3
Mean validation score: 0.864 (std: 0.015)
Parameters: {'n_estimators': 200, 'max_features': 5, 'max_depth': 3, 'bootstrap': False}



### Grid search and random search for XGBoost classifier

In [98]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier()
xgb_model.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [99]:
param_grid_xg = {'learning_rate': [0.1, 0.2, 1],'max_depth': [1,2,3],'n_estimators': [10, 50, 100, 200, 500]}
start_time = timeit.default_timer()
grid_clf_xg = GridSearchCV(xgb_model,param_grid_xg,scoring="f1_macro",n_jobs=4) # for multi class we need macro
grid_clf_xg.fit(X_train,y_train)
end_time = timeit.default_timer()
print("Time Grid SearchCv take to find best parameters: %0.3f sec" %(end_time-start_time))
print("Best parameters set found on development set:")
print()
print(grid_clf_xg.best_params_)
print()
print("Top 3 best parameters are :")
report(grid_clf_xg.cv_results_)

Time Grid SearchCv take to find best parameters: 381.747 sec
Best parameters set found on development set:

{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 200}

Top 3 best parameters are :
Model with rank: 1
Mean validation score: 0.965 (std: 0.007)
Parameters: {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 200}

Model with rank: 2
Mean validation score: 0.965 (std: 0.007)
Parameters: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 500}

Model with rank: 3
Mean validation score: 0.964 (std: 0.004)
Parameters: {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 500}



In [102]:
n_iter_search = 5 #5 candidates parameter settings.
start_time = timeit.default_timer()
rand_clf_xg = RandomizedSearchCV(xgb_model,param_grid_xg,scoring="f1_macro",n_iter=n_iter_search) # for multi class we need macro
rand_clf_xg.fit(X_train,y_train)
end_time = timeit.default_timer()
print("Time Grid SearchCv take to find best parameters: %0.3f sec" %(end_time-start_time))
print("Best parameters set found on development set:")
print()
print(rand_clf_xg.best_params_)
print()
print("Top 3 best parameters are :")
report(rand_clf_xg.cv_results_)

Time Grid SearchCv take to find best parameters: 88.269 sec
Best parameters set found on development set:

{'n_estimators': 500, 'max_depth': 2, 'learning_rate': 0.2}

Top 3 best parameters are :
Model with rank: 1
Mean validation score: 0.964 (std: 0.004)
Parameters: {'n_estimators': 500, 'max_depth': 2, 'learning_rate': 0.2}

Model with rank: 2
Mean validation score: 0.952 (std: 0.009)
Parameters: {'n_estimators': 100, 'max_depth': 2, 'learning_rate': 0.1}

Model with rank: 3
Mean validation score: 0.951 (std: 0.009)
Parameters: {'n_estimators': 100, 'max_depth': 1, 'learning_rate': 1}



### Gridsearch takes much more time than randomized search as it performs searching on a set of candidates randomly compared to exhaustive search.