In this notebook we normalize the HELOC dataset, save the scaled copy together with the scaling parameters, and then train and save a GBDT model on the scaled data.

## Table of contents

* [Setup](#setup)
* [Data loading](#loading)
* [Data preprocessing](#preprocessing)
* [Model training](#training)


## Setup <a class="anchor" id="setup"></a>

In [1]:
import os

# Move up one directory at a time until the working directory path no longer
# contains "notebooks", so the relative paths used below ("data", "models")
# resolve from there (presumably the project root -- verify for your layout).
while "notebooks" in os.getcwd():
    os.chdir("../")

from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

%load_ext autoreload
%autoreload 2


In [17]:
# Directories, relative to the current working directory.
dataset_path, model_path = Path("data"), Path("models")

# File-name prefixes: the raw input dataset and the normalized copy written below.
old_dataset_name, new_dataset_name = "heloc", "heloc-scaled"

# Prefix of the model files, derived from the scaled dataset name.
model_name = f"{new_dataset_name}-gbdt"

# True: train a new model and save it. False: load the previously saved model.
do_training = False


## Data loading <a class="anchor" id="loading"></a>

In [3]:
# Load the raw (not yet scaled) train and test splits from CSV.
train_df = pd.read_csv(dataset_path / f"{old_dataset_name}-train.csv")
test_df = pd.read_csv(dataset_path / f"{old_dataset_name}-test.csv")


### Data inspection

In [4]:
# Preview the training frame (the rendering is truncated to the first and last rows/columns).
train_df


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,67.0,282.0,11.0,108.0,37.0,0.0,0.0,95.0,3.0,4.0,...,0.0,1.0,1.0,68.0,58.000000,5.0,3.000000,2.0,80.0,0.0
1,58.0,164.0,4.0,62.0,21.0,2.0,1.0,70.0,8.0,4.0,...,0.0,5.0,5.0,27.0,94.000000,5.0,4.000000,0.0,82.0,0.0
2,71.0,103.0,15.0,94.0,6.0,1.0,0.0,86.0,13.0,6.0,...,24.0,0.0,0.0,48.0,70.210422,2.0,2.563951,0.0,67.0,0.0
3,85.0,139.0,11.0,78.0,10.0,0.0,0.0,100.0,83.0,7.0,...,24.0,2.0,1.0,19.0,66.516262,1.0,2.397238,1.0,50.0,1.0
4,79.0,803.0,3.0,101.0,38.0,0.0,0.0,100.0,83.0,7.0,...,3.0,2.0,2.0,14.0,66.516262,4.0,2.000000,0.0,46.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,69.0,292.0,6.0,121.0,28.0,3.0,3.0,90.0,64.0,6.0,...,0.0,4.0,4.0,18.0,93.000000,1.0,4.000000,0.0,55.0,0.0
7892,76.0,201.0,8.0,71.0,38.0,0.0,0.0,97.0,51.0,6.0,...,0.0,0.0,0.0,0.0,47.000000,1.0,5.000000,0.0,35.0,1.0
7893,58.0,211.0,5.0,54.0,39.0,0.0,0.0,95.0,2.0,4.0,...,0.0,1.0,1.0,25.0,43.000000,5.0,3.000000,3.0,47.0,0.0
7894,69.0,78.0,5.0,40.0,16.0,0.0,0.0,100.0,83.0,7.0,...,0.0,2.0,2.0,69.0,70.210422,3.0,2.000000,2.0,83.0,0.0


In [5]:
# Column dtypes and non-null counts: a quick check for missing values.
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7896 entries, 0 to 7895
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ExternalRiskEstimate                7896 non-null   float64
 1   MSinceOldestTradeOpen               7896 non-null   float64
 2   MSinceMostRecentTradeOpen           7896 non-null   float64
 3   AverageMInFile                      7896 non-null   float64
 4   NumSatisfactoryTrades               7896 non-null   float64
 5   NumTrades60Ever2DerogPubRec         7896 non-null   float64
 6   NumTrades90Ever2DerogPubRec         7896 non-null   float64
 7   PercentTradesNeverDelq              7896 non-null   float64
 8   MSinceMostRecentDelq                7896 non-null   float64
 9   MaxDelq2PublicRecLast12M            7896 non-null   float64
 10  MaxDelqEver                         7896 non-null   float64
 11  NumTotalTrades                      7896 no

## Data preprocessing <a class="anchor" id="preprocessing"></a>

In [6]:
# Every column except the last is a feature; the last column holds the target.
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]


### Data normalization

`StandardScaler` computes the mean and variance of each column from `X_train` only. These training-set statistics are then used to normalize both `X_train` and `X_test`, so that no information from the test set leaks into the preprocessing.


In [7]:
# Fit on the training split only, then apply the same statistics to both splits.
scaler = StandardScaler().fit(X_train)

# Wrap the transformed values in DataFrames that reuse the original column names.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)


### Saving

In [8]:
# Per-column statistics learned by the scaler from the training split.
# Note: the variance is stored here; StandardScaler scales by its square root
# (the standard deviation), so consumers must take sqrt() to reproduce the scaling.
normalization_params = {
    "mean": scaler.mean_,
    "variance": scaler.var_,
}


In [9]:
# One row per statistic ("mean", "variance"), one column per feature.
normalization_params_df = pd.DataFrame.from_dict(
    normalization_params,
    orient="index",
    columns=X_train.columns,
)
normalization_params_df


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
mean,72.114191,201.079578,9.761018,78.945162,21.087893,0.568389,0.374747,92.400963,50.8264,5.756966,...,34.648303,6.591525,1.463019,1.403242,34.849802,68.41398,4.087038,2.475941,1.092201,66.38093
variance,96.786317,9418.94297,179.998232,1167.955326,126.633359,1.420855,0.900472,138.309341,1154.616004,2.707272,...,325.664683,88.266024,4.792959,4.623617,823.090413,390.301794,8.832372,2.387937,2.175593,484.997121


In [10]:
# Persist the scaling statistics next to the dataset files.
# The row labels ("mean", "variance") are written as the first CSV column.
normalization_params_df.to_csv(dataset_path / f"{old_dataset_name}-scaling-params.csv")


In [11]:
def save_scaled_dataset(save_path: Path, X: pd.DataFrame, y: pd.Series):
    """Write the features and the target side by side to a CSV file.

    Rows of ``X`` and ``y`` are paired by position, not by index label.
    ``pd.concat(axis=1)`` aligns on the index, so without resetting it a ``y``
    whose index differs from ``X`` (e.g. after filtering or shuffling) would
    silently produce extra rows padded with NaN.

    Args:
        save_path: Destination CSV file.
        X: Feature columns.
        y: Target values (a Series, or a DataFrame of target columns) with the
            same number of rows as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )

    # Drop both indexes so the concatenation is purely positional; the index is
    # not written to the file anyway (index=False).
    df_scaled = pd.concat(
        [X.reset_index(drop=True), y.reset_index(drop=True)],
        axis=1,
    )
    df_scaled.to_csv(save_path, index=False)


In [12]:
# Write the normalized train and test splits under the new dataset name.
save_scaled_dataset(
    save_path=dataset_path / f"{new_dataset_name}-train.csv",
    X=X_train_scaled,
    y=y_train,
)
save_scaled_dataset(
    save_path=dataset_path / f"{new_dataset_name}-test.csv",
    X=X_test_scaled,
    y=y_test,
)


## Model training <a class="anchor" id="training"></a>


In [13]:
# Hyperparameters passed to xgb.train.
param = {
    "eta": 0.01,                     # learning rate (step-size shrinkage)
    "max_depth": 4,                  # maximum depth of each tree
    "objective": "binary:logistic",  # binary classification objective
    "seed": 42,                      # fixed seed for reproducibility
}

# Number of boosting rounds.
steps = 500


In [14]:
# Wrap the scaled features and their labels in XGBoost DMatrix containers,
# which are what xgb.train consumes for training and evaluation.
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)


In [15]:
%%time
if do_training:
    # Train for `steps` boosting rounds and report the metric on both the test
    # and the train split every 50 rounds.
    gbdt_model = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=steps,
        evals=[(dtest, 'test'), (dtrain, 'train')],
        verbose_eval=50,
    )


[0]	test-logloss:0.68849	train-logloss:0.68836
[50]	test-logloss:0.53465	train-logloss:0.53052
[100]	test-logloss:0.46158	train-logloss:0.45494
[150]	test-logloss:0.42393	train-logloss:0.41524
[200]	test-logloss:0.40429	train-logloss:0.39178
[250]	test-logloss:0.38740	train-logloss:0.37176
[300]	test-logloss:0.37666	train-logloss:0.35775
[350]	test-logloss:0.36913	train-logloss:0.34699
[400]	test-logloss:0.36355	train-logloss:0.33859
[450]	test-logloss:0.35930	train-logloss:0.33150
[499]	test-logloss:0.35676	train-logloss:0.32597
CPU times: user 3h 30min 2s, sys: 13min 23s, total: 3h 43min 26s
Wall time: 4min


### Model saving/loading
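* `save_model()` stores the model in a format that can be restored later with `load_model()`.
* `dump_model()` exports a human-readable text dump of the trees, intended for model interpretation (for example, visualization); the dump is not meant to be loaded back into XGBoost.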

In [16]:
# Both branches use the same file, so a model trained and saved here is the one
# that gets loaded when training is skipped.
saved_model_file = model_path / f"{model_name}_saved.json"

if do_training:
    # Create the output directory first: otherwise saving fails when it is missing.
    model_path.mkdir(parents=True, exist_ok=True)
    gbdt_model.save_model(saved_model_file)
    # Text dump with split statistics, for inspection only (not reloadable).
    gbdt_model.dump_model(model_path / f"{model_name}_dumped.txt", with_stats=True)
else:
    # Fail early with an actionable message instead of an opaque library error.
    if not saved_model_file.exists():
        raise FileNotFoundError(
            f"No saved model at {saved_model_file}; set do_training = True to train one."
        )
    gbdt_model = xgb.Booster()
    gbdt_model.load_model(saved_model_file)
    