# <center>CV Xgboost

In [1]:
import os
os.chdir("..")

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import timedelta

In [3]:
# import warnings
# warnings.simplefilter('ignore')

pd.set_option('display.max_columns', 500)

In [4]:
data_path = Path("data")
model_path = Path("model_data")

### data

In [5]:
example_predictions_name = "example_predictions.pkl"
test_features_name = "test_features.pkl"
train_features_name = "train_features.pkl"
train_outcomes_name = "train_outcomes.pkl"

In [6]:
test_features = pd.read_pickle(data_path / test_features_name)
train_features = pd.read_pickle(data_path / train_features_name)
train_outcomes = pd.read_pickle(data_path / train_outcomes_name)

In [7]:
test_features.shape, train_features.shape, train_outcomes.shape

((627908, 32), (2106808, 32), (2106808, 1))

In [8]:
from dataprocessing.process import StandardScaler, SmartLogtransformer, TrainTestSplit, Pipeline
from dataprocessing.impute import SimpleImputer

In [9]:
train_test_split = TrainTestSplit(test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(train_features, train_outcomes)

### Pipeline for data processing

In [10]:
pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")), 
        ("logtransformer", SmartLogtransformer()), 
        ("scaler", StandardScaler())
    ]
)
pipe

[(imputer, <dataprocessing.impute.SimpleImputer>), 
(logtransformer, <dataprocessing.process.SmartLogtransformer>), 
(scaler, <dataprocessing.process.StandardScaler>)]

In [11]:
%%time
X_train = pipe.fit_transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(test_features)

Wall time: 5.15 s


## <center>XGBOOST

# <center>Parameter Tuning

In [12]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

#### data

In [13]:
train_len = len(X_train)
val_len = len(X_val)
dtrain = xgb.DMatrix(pd.concat([X_train, X_val]), label=pd.concat([y_train, y_val]))
train_ind = range(train_len)
val_ind = range(train_len, train_len + val_len)
dtest = xgb.DMatrix(X_test)

#### Setting Parameters

In [24]:
params = dict(
    max_depth = 3, 
    eta = 1, 
    objective = 'binary:logistic', 
    eval_metric = 'error'
)

In [25]:
cv = xgb.cv(
    params = params, 
    dtrain = dtrain,
    num_boost_round = 20, 
    nfold = 1, 
    folds = [(train_ind, val_ind)], 
    metrics = "error", 
    early_stopping_rounds = 3
)

In [26]:
cv

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.44007,0.0,0.418982,0.0
1,0.432775,0.0,0.419302,0.0
2,0.430596,0.0,0.419746,0.0
3,0.429426,0.0,0.417728,0.0
4,0.428041,0.0,0.41853,0.0
5,0.428009,0.0,0.418452,0.0
6,0.426783,0.0,0.415939,0.0
7,0.426221,0.0,0.415573,0.0
8,0.425846,0.0,0.414827,0.0
9,0.425375,0.0,0.414966,0.0


## GridSearchCVXgb

In [48]:
from dataprocessing.process import GridSearchCVXgb

In [49]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)

In [50]:
evals = [(dtrain, 'train'), (dval, 'eval')]

In [51]:
xgb_train_params = dict(
    params = params, 
    dtrain = dtrain, 
    num_boost_round = 100, 
    evals = evals, 
    early_stopping_rounds=3, 
    verbose_eval=False
)

In [52]:
param_grid = dict(
    max_depth = [2, 3], 
    eta = [1, 2]
)

In [53]:
search = GridSearchCVXgb(
    xgb_train_params = xgb_train_params,
    param_grid = param_grid,
)

In [54]:
%%time
search.fit()

Wall time: 30 s


In [55]:
search.best_score_

0.414731

In [56]:
search.best_num_boost_round

12

In [57]:
search.best_params_

{'max_depth': 3, 'eta': 1}