<a href="https://colab.research.google.com/github/sabumjung/DL-test/blob/master/xgboost_and_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install library via pip

In [1]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=c25ecd5d2a39889a978a9b84e46e0997fd63114d2bd8cea7f0752e1169763e0c
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


## Import required libraries

In [41]:
#Import libraries
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import xgboost as xgb

## We define a function to run Bayesian optimization given data, function to optimize and its hyperparameters:

In [3]:
#Bayesian optimization
def bayesian_optimization(dataset, function, parameters):
   X_train, y_train, X_test, y_test = dataset
   n_iterations = 5
   gp_params = {"alpha": 1e-4}

   BO = BayesianOptimization(function, parameters)
   BO.maximize(n_iter=n_iterations, **gp_params)

   return BO.max

In [4]:
def rfc_optimization(cv_splits):
    def function(n_estimators, max_depth, min_samples_split):
        return cross_val_score(
               RandomForestClassifier(
                   n_estimators=int(max(n_estimators,0)),                                                               
                   max_depth=int(max(max_depth,1)),
                   min_samples_split=int(max(min_samples_split,2)), 
                   n_jobs=-1, 
                   random_state=42,   
                   class_weight="balanced"),  
               X=X_train, 
               y=y_train, 
               cv=cv_splits,
               scoring="roc_auc",
               n_jobs=-1).mean()

    parameters = {"n_estimators": (10, 1000),
                  "max_depth": (1, 150),
                  "min_samples_split": (2, 10)}
    
    return function, parameters

In [5]:
def xgb_optimization(cv_splits, eval_set):
    def function(eta, gamma, max_depth):
            return cross_val_score(
                   xgb.XGBClassifier(
                       objective="binary:logistic",
                       learning_rate=max(eta, 0),
                       gamma=max(gamma, 0),
                       max_depth=int(max_depth),                                               
                       seed=42,
                       nthread=-1,
                       scale_pos_weight = len(y_train[y_train == 0])/
                                          len(y_train[y_train == 1])),  
                   X=X_train, 
                   y=y_train, 
                   cv=cv_splits,
                   scoring="roc_auc",
                   fit_params={
                        "early_stopping_rounds": 10, 
                        "eval_metric": "auc", 
                        "eval_set": eval_set},
                   n_jobs=-1).mean()

    parameters = {"eta": (0.001, 0.4),
                  "gamma": (0, 20),
                  "max_depth": (1, 2000)}
    
    return function, parameters

In [6]:
#Train model
def train(X_train, y_train, X_test, y_test, function, parameters):
    dataset = (X_train, y_train, X_test, y_test)
    cv_splits = 4
    
    best_solution = bayesian_optimization(dataset, function, parameters)      
    params = best_solution["params"]

    model = RandomForestClassifier(
             n_estimators=int(max(params["n_estimators"], 0)),
             max_depth=int(max(params["max_depth"], 1)),
             min_samples_split=int(max(params["min_samples_split"], 2)), 
             n_jobs=-1, 
             random_state=42,   
             class_weight="balanced")

    model.fit(X_train, y_train)
    
    return model

In [48]:
# Prepare the data.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X = cancer["data"]
y = cancer["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                            stratify = y,
                                        random_state = 42)

In [49]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [68]:
jj = rfc_optimization(10)
print(jj[1])
print(jj[0])

{'n_estimators': (10, 1000), 'max_depth': (1, 150), 'min_samples_split': (2, 10)}
<function rfc_optimization.<locals>.function at 0x7fc6ebd35320>


In [69]:
train(X_train, y_train, X_test, y_test, function=jj[0], parameters = jj[1])

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.9928  [0m | [0m 54.29   [0m | [0m 4.513   [0m | [0m 227.3   [0m |
| [95m 2       [0m | [95m 0.9935  [0m | [95m 62.39   [0m | [95m 8.704   [0m | [95m 579.6   [0m |
| [0m 3       [0m | [0m 0.9934  [0m | [0m 27.43   [0m | [0m 2.74    [0m | [0m 466.1   [0m |
| [0m 4       [0m | [0m 0.9932  [0m | [0m 111.6   [0m | [0m 5.606   [0m | [0m 128.2   [0m |
| [95m 5       [0m | [95m 0.9937  [0m | [95m 114.1   [0m | [95m 3.086   [0m | [95m 659.0   [0m |
| [0m 6       [0m | [0m 0.9935  [0m | [0m 69.42   [0m | [0m 5.763   [0m | [0m 759.4   [0m |
| [0m 7       [0m | [0m 0.9937  [0m | [0m 114.2   [0m | [0m 3.402   [0m | [0m 657.9   [0m |
| [0m 8       [0m | [0m 0.9937  [0m | [0m 149.8   [0m | [0m 8.915   [0m | [0m 997.9   [0m |
| [0m 9       [0m | [0m 0.9932  [0m | [0m 6.0

RandomForestClassifier(class_weight='balanced', max_depth=114,
                       min_samples_split=3, n_estimators=659, n_jobs=-1,
                       random_state=42)

In [63]:
eval_set=[(X_test, y_test)]
eval_set

[(array([[1.275e+01, 1.670e+01, 8.251e+01, ..., 8.045e-02, 3.071e-01,
          8.557e-02],
         [1.799e+01, 2.066e+01, 1.178e+02, ..., 1.974e-01, 3.060e-01,
          8.503e-02],
         [1.496e+01, 1.910e+01, 9.703e+01, ..., 1.489e-01, 2.962e-01,
          8.472e-02],
         ...,
         [1.170e+01, 1.911e+01, 7.433e+01, ..., 5.741e-02, 3.487e-01,
          6.958e-02],
         [1.450e+01, 1.089e+01, 9.428e+01, ..., 1.221e-01, 2.889e-01,
          8.006e-02],
         [1.218e+01, 1.784e+01, 7.779e+01, ..., 5.882e-02, 2.227e-01,
          7.376e-02]]),
  array([1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
         1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
         1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
         0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 

In [64]:
jj1 = xgb_optimization(10, eval_set)  # cv_splits, eval_set

In [65]:
jj1[0]

<function __main__.xgb_optimization.<locals>.function>

In [66]:
jj1[1]

{'eta': (0.001, 0.4), 'gamma': (0, 20), 'max_depth': (1, 2000)}

In [67]:
train(X_train, y_train, X_test, y_test, function=jj1[0], parameters = jj1[1])

|   iter    |  target   |    eta    |   gamma   | max_depth |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.9851  [0m | [0m 0.3287  [0m | [0m 4.998   [0m | [0m 1.024e+0[0m |
| [95m 2       [0m | [95m 0.9901  [0m | [95m 0.3012  [0m | [95m 2.481   [0m | [95m 1.876e+0[0m |
| [0m 3       [0m | [0m 0.9861  [0m | [0m 0.3177  [0m | [0m 7.451   [0m | [0m 1.684e+0[0m |
| [0m 4       [0m | [0m 0.9813  [0m | [0m 0.3134  [0m | [0m 8.16    [0m | [0m 629.4   [0m |
| [0m 5       [0m | [0m 0.9482  [0m | [0m 0.007783[0m | [0m 7.863   [0m | [0m 1.958e+0[0m |
| [95m 6       [0m | [95m 0.992   [0m | [95m 0.1276  [0m | [95m 1.096   [0m | [95m 1.875e+0[0m |
| [0m 7       [0m | [0m 0.9918  [0m | [0m 0.3692  [0m | [0m 0.04656 [0m | [0m 1.853e+0[0m |
| [0m 8       [0m | [0m 0.9878  [0m | [0m 0.2563  [0m | [0m 1.176   [0m | [0m 1.82e+03[0m |
| [0m 9       [0m | [0m 0.9805  [0m | [0m 0.3

KeyError: ignored