In [None]:
# Load packages
import time
from scipy.stats import uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score

from hyperopt import hp, fmin, tpe
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from bayes_opt import BayesianOptimization

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides any warnings emitted while the
# hyperparameter searches below are fitting — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Remove pandas' cap on displayed columns so wide frames are rendered in full.
pd.set_option("display.max_columns", None)

In [None]:
# Scorer: plain accuracy, wrapped so it can be passed as a `scoring` argument
acc_score = make_scorer(score_func=accuracy_score)

In [None]:
# Load dataset: training rows, test rows and the sample submission, in that order
trainSet, testSet, submitSet = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

# Peek at the first rows of the raw training data
trainSet.head()

# Exploratory look at the Cabin column: drop Name/Ticket and split each Cabin
# value into its first character and the remainder.
# The result is kept under its own name so that this inspection-only step never
# overwrites the modelling frame `train` (re-running this cell would otherwise
# silently replace the encoded training data with this un-encoded view).
train_cabin = trainSet.drop(columns=['Name', 'Ticket']).assign(
    Cabin_letter=lambda frame: frame['Cabin'].str[0:1],
    Cabin_no=lambda frame: frame['Cabin'].str[1:],
)

train_cabin.head()

# Feature generation: training data
# Drop Name, Ticket and Cabin, discard rows with any missing value, then
# dummy-encode the remaining non-numeric columns.
train = (
    trainSet
    .drop(columns=['Name', 'Ticket', 'Cabin'])
    .dropna(axis=0)
    .pipe(pd.get_dummies)
)

train.head()

In [None]:
# Train/validation split: 80/20, stratified on the target, fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['PassengerId', 'Survived']),  # predictors only
    train['Survived'],                                # target
    stratify=train['Survived'],
    test_size=0.2,
    random_state=111,
)

In [None]:
# GridSearchCV: exhaustive search over a small hyperparameter grid
param_grid = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
              # Floats are read as a fraction of the features; an integer 1
              # would mean "exactly one feature per split", not 100%.
              'max_features': [0.8, 0.9, 1.0],
              'learning_rate': [0.01, 0.1, 1],
              'n_estimators': [80, 100, 120, 140, 150],
              'subsample': [0.8, 0.9, 1.0]}

# Seed the estimator so re-running the cell scores every candidate identically
# (subsample/max_features below 1 make the fit stochastic otherwise).
grid = GridSearchCV(estimator=GradientBoostingClassifier(random_state=111),
                    param_grid=param_grid, scoring=acc_score, cv=5)

# Fit on the first 100 training rows only, to keep the exhaustive search cheap.
# `:100` rather than `1:100`, which would skip row 0 and use 99 rows.
grid.fit(X_train.iloc[:100], y_train.iloc[:100])

In [None]:
# RandomizedSearchCV: sample candidates instead of enumerating a full grid
#
# scipy.stats.uniform(loc, scale) covers [loc, loc + scale]: the second argument
# is the WIDTH of the interval, not its upper bound. Fractions must stay <= 1.
# Integer-valued hyperparameters are given as explicit lists, which
# RandomizedSearchCV samples uniformly, so the estimator never receives a float
# tree depth or estimator count.
param_rand = {'max_depth': list(range(3, 11)),        # 3..10
              'max_features': uniform(0.8, 0.2),      # [0.8, 1.0]
              'learning_rate': uniform(0.01, 0.99),   # [0.01, 1.0]
              'n_estimators': list(range(80, 151)),   # 80..150
              'subsample': uniform(0.8, 0.2)}         # [0.8, 1.0]

# Seed both the estimator and the sampler so the search is reproducible.
rand = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=111),
                          param_distributions=param_rand, scoring=acc_score, cv=5,
                          random_state=111)

# Fit on the first 100 training rows only, to keep the search cheap.
# `:100` rather than `1:100`, which would skip row 0 and use 99 rows.
rand.fit(X_train.iloc[:100], y_train.iloc[:100])

In [None]:
# RandomizedSearchCV
# NOTE(review): this cell repeats the RandomizedSearchCV cell directly above it;
# one of the two can probably be deleted.
#
# scipy.stats.uniform(loc, scale) covers [loc, loc + scale]: the second argument
# is the WIDTH of the interval, not its upper bound. Fractions must stay <= 1.
# Integer-valued hyperparameters are given as explicit lists, which
# RandomizedSearchCV samples uniformly, so the estimator never receives a float
# tree depth or estimator count.
param_rand = {'max_depth': list(range(3, 11)),        # 3..10
              'max_features': uniform(0.8, 0.2),      # [0.8, 1.0]
              'learning_rate': uniform(0.01, 0.99),   # [0.01, 1.0]
              'n_estimators': list(range(80, 151)),   # 80..150
              'subsample': uniform(0.8, 0.2)}         # [0.8, 1.0]

# Seed both the estimator and the sampler so the search is reproducible.
rand = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=111),
                          param_distributions=param_rand, scoring=acc_score, cv=5,
                          random_state=111)

# Fit on the first 100 training rows only, to keep the search cheap.
# `:100` rather than `1:100`, which would skip row 0 and use 99 rows.
rand.fit(X_train.iloc[:100], y_train.iloc[:100])

# Bayes_opt

In [None]:
# Gradient Boosting Machine
def gbm_cl_bo(max_depth, max_features, learning_rate, n_estimators, subsample):
    """Objective for the Bayesian optimizer: mean 5-fold CV accuracy of a GBM.

    ``max_depth`` and ``n_estimators`` are rounded to the nearest integer
    before the classifier is built; the other three values are passed through
    unchanged. Reads ``X_train``, ``y_train`` and ``acc_score`` from the
    notebook's global scope.

    Returns:
        The mean of the five cross-validation fold scores (higher is better).
    """
    gbm_kwargs = {
        'max_depth': round(max_depth),
        'max_features': max_features,
        'learning_rate': learning_rate,
        'n_estimators': round(n_estimators),
        'subsample': subsample,
    }

    classifier = GradientBoostingClassifier(random_state=123, **gbm_kwargs)
    fold_scores = cross_val_score(classifier, X_train, y_train,
                                  scoring=acc_score, cv=5)

    return fold_scores.mean()

In [None]:
# Run Bayesian Optimization
start = time.time()

# (lower, upper) search bounds for each hyperparameter
params_gbm = dict(
    max_depth=(3, 10),
    max_features=(0.8, 1),
    learning_rate=(0.01, 1),
    n_estimators=(80, 150),
    subsample=(0.8, 1),
)

gbm_bo = BayesianOptimization(f=gbm_cl_bo, pbounds=params_gbm, random_state=111)
# 20 initial points followed by 4 optimization iterations: 24 evaluations in total
gbm_bo.maximize(n_iter=4, init_points=20)

# Report wall-clock time of the whole search
print('It takes %s minutes' % ((time.time() - start) / 60))

In [None]:
# Hyperparameters of the best-scoring point found by the optimizer
params_gbm = gbm_bo.max['params']

# Round the two integer-valued settings to whole numbers, in place
params_gbm.update({
    int_key: round(params_gbm[int_key])
    for int_key in ('max_depth', 'n_estimators')
})

params_gbm

# hyperopt

In [None]:
# Run Bayesian Optimization from hyperopt
start = time.time()

# Search space. hp.randint(label, low, high) draws integers from [low, high):
# the upper bound is exclusive, so it is set one past the largest value that
# should be tried (max_depth up to 10, n_estimators up to 150), matching the
# ranges used by the other searches in this notebook.
space_lr = {'max_depth': hp.randint('max_depth', 3, 11),
            'max_features': hp.uniform('max_features', 0.8, 1),
            'learning_rate': hp.uniform('learning_rate', 0.01, 1),
            'n_estimators': hp.randint('n_estimators', 80, 151),
            'subsample': hp.uniform('subsample', 0.8, 1)}

def gbm_cl_bo2(params):
    """Loss for hyperopt: one minus the mean 5-fold CV accuracy of a GBM.

    ``params`` is a mapping holding the five tuned hyperparameters; only those
    five keys are forwarded to the classifier. Reads ``X_train``, ``y_train``
    and ``acc_score`` from the notebook's global scope.

    Returns:
        ``1 - mean accuracy``, so that minimizing the loss maximizes accuracy.
    """
    tuned_keys = ('max_depth', 'max_features', 'learning_rate',
                  'n_estimators', 'subsample')
    gbm_kwargs = {key: params[key] for key in tuned_keys}

    classifier = GradientBoostingClassifier(random_state=111, **gbm_kwargs)
    fold_scores = cross_val_score(classifier, X_train, y_train,
                                  scoring=acc_score, cv=5)

    return 1 - fold_scores.mean()

# Minimize the loss with the TPE algorithm over 24 evaluations.
# NOTE(review): newer hyperopt releases may expect a numpy Generator
# (np.random.default_rng) for `rstate` — confirm against the installed version.
gbm_best_param = fmin(
    fn=gbm_cl_bo2,
    space=space_lr,
    algo=tpe.suggest,
    max_evals=24,
    rstate=np.random.RandomState(42),
)

# Report wall-clock time of the whole search
print('It takes %s minutes' % ((time.time() - start) / 60))

In [None]:
# Display the value returned by hyperopt's fmin for the search above
gbm_best_param