# <center>Xgboost GridSearch

In [1]:
import os
os.chdir("..")

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import timedelta

In [3]:
# import warnings
# warnings.simplefilter('ignore')

pd.set_option('display.max_columns', 500)

In [4]:
data_path = Path("data")
model_path = Path("model_data")

### data

In [5]:
example_predictions_name = "example_predictions.pkl"
test_features_name = "test_features.pkl"
train_features_name = "train_features.pkl"
train_outcomes_name = "train_outcomes.pkl"

In [6]:
test_features = pd.read_pickle(data_path / test_features_name)
train_features = pd.read_pickle(data_path / train_features_name)
train_outcomes = pd.read_pickle(data_path / train_outcomes_name)

In [7]:
test_features.shape, train_features.shape, train_outcomes.shape

((627908, 32), (2106808, 32), (2106808, 1))

In [8]:
from dataprocessing.process import StandardScaler, SmartLogtransformer, TrainTestSplit, Pipeline
from dataprocessing.impute import SimpleImputer

In [9]:
train_test_split = TrainTestSplit(test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(train_features, train_outcomes)

### Pipeline for data processing

In [10]:
pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")), 
        ("logtransformer", SmartLogtransformer()), 
        ("scaler", StandardScaler())
    ]
)
pipe

[(imputer, <dataprocessing.impute.SimpleImputer>), 
(logtransformer, <dataprocessing.process.SmartLogtransformer>), 
(scaler, <dataprocessing.process.StandardScaler>)]

In [11]:
%%time
X_train = pipe.fit_transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(test_features)

Wall time: 10.9 s


## <center>XGBOOST

# <center>Parameters Tuning

## GridSearchXgb

In [12]:
import xgboost as xgb

#### Setting Parameters

In [103]:
from dataprocessing.process import GridSearchXgb

In [17]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)

In [18]:
evals = [(dtrain, 'train'), (dval, 'eval')]

In [25]:
params = dict(
    objective = 'binary:logistic', 
    eval_metric = 'error'
)

In [66]:
xgb_train_params = dict(
    params = params, 
    dtrain = dtrain, 
    num_boost_round = 100, 
    evals = evals, 
    early_stopping_rounds=3, 
    verbose_eval=False
)

In [94]:
param_grid = dict(
    max_depth = range(1, 3), 
    eta = [1, 2]
)

In [95]:
search = GridSearchXgb(
    xgb_train_params = xgb_train_params,
    param_grid = param_grid,
    verbose = True
)

In [96]:
%%time
search.fit()

total: 4
....
Wall time: 14.3 s


In [97]:
search.best_score_

0.412205

In [98]:
search.best_num_boost_round

8

In [99]:
search.best_params_

{'max_depth': 1, 'eta': 1}

In [100]:
search.cv_results_df_

Unnamed: 0_level_0,score
"max_depth, eta",Unnamed: 1_level_1
"(1, 1)",0.412205
"(1, 2)",0.43087
"(2, 1)",0.415137
"(2, 2)",0.43087


In [102]:
search.best_xgb_train_params

{'params': {'max_depth': 1, 'eta': 1},
 'dtrain': <xgboost.core.DMatrix at 0x2034fdfa2e0>,
 'num_boost_round': 100,
 'evals': [(<xgboost.core.DMatrix at 0x2034fdfa2e0>, 'train'),
  (<xgboost.core.DMatrix at 0x2034fdfa3d0>, 'eval')],
 'early_stopping_rounds': 3,
 'verbose_eval': False}