# <center>XGBoost GridSearch</center>

In [1]:
import os
# Move the working directory up one level so the relative paths and the
# `dataprocessing` imports used below resolve from the parent folder.
# NOTE(review): this is not idempotent — re-running the cell without a kernel
# restart moves up one more level each time.
os.chdir("..")

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import timedelta

In [3]:
# import warnings
# warnings.simplefilter('ignore')

# Let wide frames render up to 500 columns instead of being truncated.
pd.options.display.max_columns = 500

In [4]:
# Folders are resolved relative to the current working directory.
data_path, model_path = Path("data"), Path("model_data")

### Data

In [5]:
# Names of the pickled inputs; they are joined onto `data_path` when loaded.
example_predictions_name, test_features_name = "example_predictions.pkl", "test_features.pkl"
train_features_name, train_outcomes_name = "train_features.pkl", "train_outcomes.pkl"

In [6]:
# Load the three pickled frames in a fixed order: test features, train
# features, train outcomes.
# NOTE: unpickling can execute arbitrary code — only load files from a
# trusted source.
test_features, train_features, train_outcomes = (
    pd.read_pickle(data_path / file_name)
    for file_name in (test_features_name, train_features_name, train_outcomes_name)
)

In [7]:
# Sanity check: (rows, columns) of each loaded frame, in load order.
tuple(frame.shape for frame in (test_features, train_features, train_outcomes))

((627908, 32), (2106808, 32), (2106808, 1))

In [8]:
from dataprocessing.process import StandardScaler, SmartLogtransformer, TrainTestSplit, Pipeline
from dataprocessing.impute import SimpleImputer

In [9]:
# Split the labelled data into training and validation parts
# (presumably 20% goes to validation, given test_size=0.2 — confirm against
# TrainTestSplit).
# NOTE(review): no seed is passed here; check whether TrainTestSplit shuffles
# and, if so, how to make the split reproducible across runs.
train_test_split = TrainTestSplit(test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(train_features, train_outcomes)

### Pipeline for data processing

In [10]:
# Preprocessing steps, listed in the order they are handed to the pipeline:
# median imputation, log transform, then standard scaling.
processing_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("logtransformer", SmartLogtransformer()),
    ("scaler", StandardScaler()),
]
pipe = Pipeline(processing_steps)
pipe

[(imputer, <dataprocessing.impute.SimpleImputer>), 
(logtransformer, <dataprocessing.process.SmartLogtransformer>), 
(scaler, <dataprocessing.process.StandardScaler>)]

In [11]:
%%time
# Fit the pipeline on the training split only, then apply the already-fitted
# pipeline to the validation and test features.
# NOTE(review): X_train and X_val are overwritten with their transformed
# versions, so this cell is not idempotent — re-running it without re-running
# the split cell would transform already-transformed data.
X_train = pipe.fit_transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(test_features)

Wall time: 5.22 s


## <center>XGBoost</center>

# <center>Parameter Tuning</center>

## GridSearchXgb

In [12]:
import xgboost as xgb

#### Setting Parameters

In [13]:
from dataprocessing.process import GridSearchXgb

In [14]:
# Wrap the processed arrays in XGBoost's DMatrix container.
# The test matrix carries no label.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test)

In [15]:
# Datasets monitored during training. Order matters: per the xgboost docs,
# early stopping is evaluated on the LAST entry, so validation stays last.
evals = [
    (dtrain, "train"),
    (dval, "eval"),
]

In [47]:
# Base booster settings. The tuning function below adds the chosen value of
# each tuned hyper-parameter to this same dict.
params = {
    "objective": "binary:logistic",
    "eval_metric": "error",
}

In [48]:
# Training configuration handed to GridSearchXgb. The "params" entry is the
# very same dict object as `params`, so in-place updates to `params` are
# visible through this mapping as well.
xgb_train_params = {
    "params": params,
    "dtrain": dtrain,
    "num_boost_round": 100,
    "evals": evals,
    "early_stopping_rounds": 3,
    "verbose_eval": False,
}

In [49]:
# Candidate values per hyper-parameter. Key order matters: the tuning function
# walks the keys in this order and fixes one parameter at a time.
param_grid_total = {
    # eta (learning rate) is documented by XGBoost with range [0, 1]; the
    # former candidates 2 and 3 were outside that range and only cost extra
    # training runs, so they were dropped.
    "eta": [0.01, 0.05, 0.1, 0.5, 1],
    "max_depth": range(3, 10),
    "min_child_weight": range(1, 8),
    "subsample": [0.5, 1],
    "colsample_bytree": [0.5, 1],
    # NOTE(review): alpha's default (0) is not among the candidates, so this
    # step always switches L1 regularisation on — confirm that is intended.
    "alpha": [0.3, 0.65, 1],
}

In [50]:
def params_tunning(param_grid_total, train_params=None):
    """Tune hyper-parameters one at a time with successive 1-D grid searches.

    The keys of ``param_grid_total`` are visited in iteration order. For each
    key a ``GridSearchXgb`` is run over that key's candidates only, and the
    winning value is written into the booster-parameter dict before the next
    key is searched, so later searches are configured with the earlier picks.
    This is a greedy, coordinate-wise search, not a full Cartesian grid.

    Parameters
    ----------
    param_grid_total : mapping
        Hyper-parameter name -> iterable of candidate values.
    train_params : dict, optional
        Training configuration passed to ``GridSearchXgb`` as
        ``xgb_train_params``; must contain a ``"params"`` dict. Defaults to
        the notebook-level ``xgb_train_params``.

    Returns
    -------
    tuple
        ``(best_score, params)``. ``best_score`` is the ``best_score_`` of the
        last search that ran, or ``None`` when ``param_grid_total`` is empty.
        ``params`` is the ``train_params["params"]`` dict itself, updated in
        place (not a copy).

    Raises
    ------
    KeyError
        If ``train_params`` has no ``"params"`` entry.
    """
    if train_params is None:
        train_params = xgb_train_params

    # Update the dict the searches are actually configured with. Writing to
    # the global `params` name instead breaks silently once that name has been
    # rebound to a different dict (e.g. its cell was re-run on its own).
    tuned_params = train_params["params"]

    # Stays None for an empty grid instead of failing on an unbound `search`.
    best_score = None
    for param in param_grid_total:
        search = GridSearchXgb(
            xgb_train_params = train_params,
            param_grid = {param: param_grid_total[param]},
        )
        search.fit()
        print(f"best_params: {search.best_params_}, best score: {search.best_score_}, best num round: {search.best_num_boost_round}")
        # Fix the winner(s) so the following searches start from them.
        tuned_params.update(search.best_params_)
        best_score = search.best_score_
    return best_score, tuned_params

In [51]:
# Run the one-parameter-at-a-time search over the whole grid.
# `best_params` is the params dict returned by the function (not a copy), and
# `best_score` is the score reported by the final search step.
best_score, best_params = params_tunning(param_grid_total)

best_params: {'eta': 0.1}, best score: 0.407981, best num round: 8
best_params: {'max_depth': 5}, best score: 0.407465, best num round: 10
best_params: {'min_child_weight': 1}, best score: 0.407465, best num round: 10
best_params: {'subsample': 0.5}, best score: 0.407168, best num round: 13
best_params: {'colsample_bytree': 1}, best score: 0.407168, best num round: 13
best_params: {'alpha': 1}, best score: 0.406794, best num round: 14


In [52]:
# Score of the last search step (the final key of the grid).
best_score

0.406794

In [53]:
# eval_metric is 'error' (fraction of misclassified rows), so accuracy = 1 - error.
# NOTE(review): presumably the score comes from the validation set — confirm
# against GridSearchXgb.
print("best accuracy:", 1 - best_score)

best accuracy: 0.593206


In [54]:
# Base settings plus the value selected for every tuned hyper-parameter.
best_params

{'objective': 'binary:logistic',
 'eval_metric': 'error',
 'eta': 0.1,
 'max_depth': 5,
 'min_child_weight': 1,
 'subsample': 0.5,
 'colsample_bytree': 1,
 'alpha': 1}