# 1. Import Dataset

In [1]:
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data[:,:2]
Y = iris.target

# 2. Import Libraries

- 1 ) Modeling
- 2 ) Hyperparameter Tuning

In [2]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from bayes_opt import BayesianOptimization

from sklearn.model_selection import cross_val_score, train_test_split

# 3. Modeling

## 1) Hyperparameter Tuning (X)

In [4]:
names = ['Decision Tree', 'Random Forest', 'Light GBM', 'XG Boost']
clf_list = [DecisionTreeClassifier(random_state=42), RandomForestClassifier(random_state=42), lgb.LGBMClassifier(random_state=42), xgb.XGBClassifier(objective = 'multi:softprob',random_state=42) ]

for name, clf in zip(names, clf_list):
    clf.fit(X,Y)
    print('---- {} ----'.format(name))
    print('cv score : ', cross_val_score(clf, X, Y, cv=5).mean())

---- Decision Tree ----
cv score :  0.7133333333333332
---- Random Forest ----




cv score :  0.76
---- Light GBM ----
cv score :  0.76
---- XG Boost ----
cv score :  0.7133333333333333


(Tuning 이전) RandomForest 와 LightGBM이 0.76으로 가장 좋다

## 2) Hyperparameter Tuning (O)

In [5]:
dt_params = {"criterion": ["gini", "entropy"],
              "min_samples_split": np.arange(2, 20,4),
              "max_depth": np.arange(1, 20,4),
              "min_samples_leaf": np.arange(1, 20,4),
              "max_leaf_nodes": np.arange(2, 20,4)}

rf_params = {'max_depth':np.arange(3, 30,3), 
            'n_estimators':np.arange(100, 400,80),
            'min_samples_split':np.arange(2, 10,2)}

lgbm_params ={'max_depth': np.arange(3, 30,6),
             'num_leaves': np.arange(10, 100,20), 
             'learning_rate': [ 0.01, 0.05, 0.01, 0.001],
             'min_child_samples': np.arange(2, 30,5),
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': np.linspace(0.6, 0.9, 30, endpoint=True), 
             'colsample_bytree': np.linspace(0.1, 0.8, 100, endpoint=True),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'n_estimators': np.arange(100, 400)}

xgb_params = {'eta':  np.linspace(0.001, 0.4, 50),
              'min_child_weight': [1, 5, 10],
              'gamma': np.arange(0, 20,4),
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': np.arange(1, 500,50)}

params_list = [dt_params, rf_params, lgbm_params, xgb_params]

# [ 1. Grid Search ]

매우 오래 걸린다 ( 일일히 다 탐색해야하므로 )

In [6]:
import time

In [7]:
def hypertuning_gscv(est, p_distr,X,y):
    gdsearch = GridSearchCV(est, param_grid=p_distr, n_jobs=-1, cv=5)
    gdsearch.fit(X,y)
    bt_param = gdsearch.best_params_
    bt_score = gdsearch.best_score_    
    return bt_param, bt_score

In [8]:
best_param_dict = dict()
print('5-fold cross validation scores & best parameters :\n')

for name, clf, param_list in zip(names, clf_list, params_list):
    print('---- {} with Grid Search ----'.format(name))
    start = time.time()
    best_params = hypertuning_gscv(clf, param_list,X, Y)
    best_param_dict[name] = best_params[0]
    print('best_params : ', best_params[0])
    
    clf.set_params(**best_params[0])
    cv_score = cross_val_score(clf, X, Y, cv=5).mean()
    end = time.time()
    print('cv score : ', cv_score)
    print('time spent using Grid search ({}) is {}'.format(name,end - start),'\n')   

5-fold cross validation scores & best parameters :

---- Decision Tree with Grid Search ----
best_params :  {'criterion': 'gini', 'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
cv score :  0.8
time spent using Grid search (Decision Tree) is 16.065789222717285 

---- Random Forest with Grid Search ----
best_params :  {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 180}
cv score :  0.8066666666666666
time spent using Grid search (Random Forest) is 103.30277466773987 

---- Light GBM with Grid Search ----


MemoryError: 

# [ 2. Random Search ]

Grid Search보다는 효율적으로 공간을 탐색한다

In [None]:
def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr, n_jobs=-1, n_iter=nbr_iter, cv=5, random_state=0)
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [None]:
best_param_dict = dict()
print('5-fold cross validation scores & best parameters :\n')

for name, clf, param_list in zip(names, clf_list, params_list):
    print('---- {} with Random Search ----'.format(name))
    start = time.time()
    best_params = hypertuning_rscv(clf, param_list, 30, X, Y)
    best_param_dict[name] = best_params[0]
    print('best_params : ', best_params[0])
    
    clf.set_params(**best_params[0])
    cv_score = cross_val_score(clf, X, Y, cv=5).mean()
    end = time.time()
    print('cv score : ', cv_score)
    print('time spent using Random search ({}) is {}'.format(name,end - start),'\n')   

(Tuning 이후) 0.793으로 RF가 가장 좋다

# [ 3. Bayesian Optimization ]

가장 성능이 좋게 나왔던 RF을 대상으로 Bayesian Optimization을 해봐서 hyperparameter를 튜닝해볼 것이다

In [None]:
def RF_tune(max_depth, n_estimators, min_samples_split,X,Y):
    est = RandomForestClassifier(
    max_depth=max_depth,    
    n_estimators = n_estimators,
    min_samples_split = min_samples_split,
    random_state =42)
    cv = cross_val_score(est,X,Y,cv=5)         
    return cv.mean()

In [None]:
best_params = []
best_target=[]

def hypertuning_bayesopt(X,Y):
    def cross_val(max_depth, n_estimators, min_samples_split):
        return RF_tune(int(max_depth),int(n_estimators),int(min_samples_split),X,Y)        
    
    bayes_opt = BayesianOptimization(
        f = cross_val,
        pbounds={'max_depth':(3, 30), 
            'n_estimators':(100, 400),
            'min_samples_split':(2, 10)},
        random_state=42)
    
    bayes_opt.maximize(n_iter=10)
    best_target.append(bayes_opt.max['target'])
    best_params.append(bayes_opt.max['params'])
    print(bayes_opt.max['target'])
    print(bayes_opt.max['params'])

In [None]:
start = time.time()
hypertuning_bayesopt(X,Y)
end = time.time()

print('time spent using Bayesian Optimization is {}'.format(end - start),'\n')   

In [None]:
rf_bayes = RandomForestClassifier(random_state=42)
rf_bayes.set_params(**best_params[0])
cv_score = cross_val_score(rf_bayes, X, Y, cv=5).mean()

print('cv score : ', cv_score)