In [None]:
# https://www.kaggle.com/hamditarek/market-prediction-xgboost-with-gpu-fit-in-1min

import os

import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

from xgboost import XGBClassifier

# Load the competition's training table from ../input/jane-street-market-prediction.
comp_folder = os.path.join(os.pardir, "input", "jane-street-market-prediction")
train_csv_path = os.path.join(comp_folder, "train.csv")
df = pd.read_csv(train_csv_path)

# Narrow every float64 column to float32; columns of any other dtype are left
# exactly as pandas parsed them.
float64_columns = df.select_dtypes(include="float64").columns
df = df.astype(dict.fromkeys(float64_columns, np.float32))

# Replace missing entries with the sentinel -999 (the classifier below is
# configured with the same value as its `missing` marker).
df.fillna(value=-999, inplace=True)

# Hold out by date to reduce temporal correlation between the two sets:
# rows with date < 350 go to training, rows with date >= 400 go to testing,
# and dates in [350, 400) are used by neither.
is_train_date = df["date"] < 350
is_test_date = df["date"] >= 400
test_df = df.loc[is_test_date]

# Training additionally keeps only the rows whose weight is non-zero.
has_weight = df["weight"] != 0
train_df = df.loc[is_train_date & has_weight]

# Feature columns are those whose name contains "feature"; the label is 1 when
# resp is strictly positive and 0 otherwise.
features = [column for column in df.columns if "feature" in column]
train_X = train_df.loc[:, features]
train_y = train_df["resp"].gt(0).astype(int)

# Fit gradient boosted decision trees. The hyper-parameters are gathered in a
# single mapping so the whole configuration can be read in one place.
# NOTE(review): "gpu_hist" requires a GPU-enabled XGBoost build, and newer
# XGBoost releases appear to deprecate it in favour of tree_method="hist"
# together with device="cuda" — confirm against the installed version.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 11,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    "missing": -999,  # same sentinel the data-cleaning step writes for NaNs
    "random_state": 2020,
    "tree_method": "gpu_hist",
}
model = XGBClassifier(**xgb_params)
model.fit(train_X, train_y)

def _report_metrics(title: str, y_true, y_pred) -> None:
    """Print the confusion matrix, precision, recall and F1 for one data split.

    Args:
        title: Heading line printed before the metrics (e.g. "TRAINING SET:").
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned element-wise with ``y_true``.
    """
    print(title)
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(f"Precision: {precision_score(y_true, y_pred)}")
    print(f"Recall: {recall_score(y_true, y_pred)}")
    print(f"F1: {f1_score(y_true, y_pred)}")


# metrics on training data
train_preds = model.predict(train_X)
_report_metrics("TRAINING SET:", train_y, train_preds)

# Two blank lines separate the training report from the test report.
print("\n")

# metrics on test data; unlike the training set, zero-weight rows are NOT
# filtered out here.
test_X = test_df[features]
test_y = (test_df["resp"] > 0).astype(int)
test_preds = model.predict(test_X)
_report_metrics("TEST SET:", test_y, test_preds)