# Bagging Experiments

In this notebook we try to address the overfitting problem of the gradient boosted trees in the `naive.ipynb` notebook by implementing some suggestions from the book by Lopez de Prado. We still only use the features at time $t$ to predict the response at time $t$.

In [None]:
%%capture
%pip install datatable

import os
from time import time

import numpy as np

import datatable as dt
import pandas as pd

from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

Load the data.

In [None]:
# The competition files are expected in a sibling "input" directory.
comp_folder = os.path.join(os.pardir, "input", "jane-street-market-prediction")
train_csv = os.path.join(comp_folder, "train.csv")

# Parse the CSV with datatable, hand the result over to pandas and
# index the rows by their "ts_id" column.
df = dt.fread(train_csv).to_pandas().set_index("ts_id")

# Halve the memory footprint: every float64 column is downcast to float32.
float64_cols = df.select_dtypes(include="float64").columns
df = df.astype(dict.fromkeys(float64_cols, np.float32))

# Date-based split. Rows with 350 <= date < 400 are used by neither set,
# which leaves a gap that reduces temporal correlation between the two.
is_train = df["date"] < 350
is_test = df["date"] >= 400
train_df, test_df = df[is_train], df[is_test]

# Every column whose name contains "feature" is a model input; the "resp"
# column is the target and the "weight" column is kept alongside it.
feat_cols = [name for name in train_df.columns if "feature" in name]
train_X, test_X = train_df[feat_cols], test_df[feat_cols]
train_y, test_y = train_df["resp"], test_df["resp"]
train_weights, test_weights = train_df["weight"], test_df["weight"]

Train a model.

In [None]:
# Training pipeline: impute -> z-score -> PLS rotation -> z-score -> bagged SVCs.
# Every fitted transformer (imp, ss, pls, ss2) and the classifier (clf) stay
# in the namespace so that the evaluation cell can re-apply them to test data.

# Binary classification target: 1 when the raw response is positive.
# The labels are derived from the *raw* `resp`, before any standardisation,
# so that they follow the same rule as the test labels (`test_y.gt(0)`).
# Thresholding the z-scored response at 0 would instead mean
# "resp > mean(resp)", which is a different labelling rule.
train_y_pos = train_y.gt(0).astype(int)

# z-scored copy of the response, used only as the PLS regression target.
# `train_y` itself is left untouched so that re-running this cell always
# starts from the same raw values.
train_y_z = (train_y - train_y.mean()) / train_y.std()

# replace missing values by the per-feature median of the training data
imp = SimpleImputer(strategy="median")
flow = imp.fit_transform(train_X)

# z-score the features
ss = StandardScaler()
flow = ss.fit_transform(flow)

# rotate features onto the PLS directions, i.e. those with maximal
# covariance with the response
# NOTE(review): the original remark here was "40 PCA components carry 95%
# variance", yet 60 components are kept -- confirm which number is intended.
pls = PLSRegression(n_components=60)
pls.fit(flow, train_y_z)
flow = pls.transform(flow)

# Alternative model that was tried before the bagged SVCs (kept for reference):
# clf = RandomForestClassifier(n_estimators=1000,
#                              max_depth=10,
#                              max_features="log2",
#                              min_weight_fraction_leaf=0.05,
#                              class_weight="balanced_subsample",
#                              criterion="entropy",
#                              random_state=42
#                             )

# re-z-score the features for SVM
ss2 = StandardScaler()
flow = ss2.fit_transform(flow)

# bag several support vector classifiers
# the classifiers stop early (max_iter) and are each trained on a restricted
# draw of samples/features
# The estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` between scikit-learn releases, and the
# positional form works with both spellings.
clf = BaggingClassifier(SVC(max_iter=100000),
                        n_estimators=300,
                        max_samples=10000,
                        max_features=10,
                        bootstrap_features=True,
                        random_state=42)  # fixed seed -> reproducible ensemble

tick = time()
clf.fit(flow, train_y_pos)
tock = time()
print(f"Training took {(tock-tick) // 60} minutes")

pred = clf.predict(flow)

# metrics on training data
print("TRAINING SET:")
print("Confusion matrix:")
print(confusion_matrix(train_y_pos, pred))
print(f"Precision: {precision_score(train_y_pos, pred)}")
print(f"Recall: {recall_score(train_y_pos, pred)}")
print(f"F1: {f1_score(train_y_pos, pred)}")

Evaluate on test set.

In [None]:
# Push the held-out features through the transformers fitted on the training
# data, in the same order: imputer -> scaler -> PLS rotation -> second scaler.
flow = ss2.transform(pls.transform(ss.transform(imp.transform(test_X))))
pred = clf.predict(flow)

# Test labels: 1 when the response is positive, 0 otherwise.
test_y_pos = test_y.gt(0).astype(int)

# metrics on test set
print("TEST SET:")
print("Confusion matrix:")
print(confusion_matrix(test_y_pos, pred))
for label, scorer in (("Precision", precision_score),
                      ("Recall", recall_score),
                      ("F1", f1_score)):
    print(f"{label}: {scorer(test_y_pos, pred)}")