# JS - Mutual Info Selection + XGBoost

In [None]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit

from xgboost import XGBClassifier

In [None]:
# how many of the leading entries of the sorted feature list are fed to the model
FEATURES = 40

# hyper-parameter grid explored during model selection (tree depths 5 through 11)
PARAMS = {"max_depth": list(range(5, 12))}

In [None]:
# Load the training CSV with every column typed as a 32-bit float.
file = os.path.join(os.pardir, "input", "jane-street-market-prediction", "train.csv")

# Read a single row first, only to learn the column names, then map each name to float32.
header = pd.read_csv(file, nrows=1)
dtype = dict.fromkeys(header.columns, np.float32)

full_df = pd.read_csv(file, dtype=dtype, engine="c")

In [None]:
# Replace every missing value with the sentinel -100.0, modifying full_df in place.
full_df.fillna(-100.0, inplace=True)

# Date-based split: dates 86..375 train, dates 425..500 validate (both ends inclusive).
dates = full_df["date"]
train_df = full_df[(dates >= 86) & (dates <= 375)]
valid_df = full_df[(dates >= 425) & (dates <= 500)]

# Load the feature names that were pre-sorted by mutual information score.
mi_folder = os.path.join(os.pardir, "input", "jane-street-mutual-info")
with open(os.path.join(mi_folder, "sorted_features.json")) as f:
    sf_dict = json.load(f)
sorted_features = sf_dict["sorted_features"]

# Keep the first FEATURES names as model inputs.
features = sorted_features[:FEATURES]
train_X, valid_X = (df[features].to_numpy() for df in (train_df, valid_df))

# Binary label: 1.0 where resp is strictly positive, 0.0 otherwise.
train_y, valid_y = (
    (df["resp"] > 0.0).astype(np.float32).to_numpy() for df in (train_df, valid_df)
)

In [None]:
# base model used as the estimator template for the grid search
clf = XGBClassifier(n_estimators=100, tree_method="gpu_hist", random_state=13)

# Stack train + validation into one dataset for GridSearchCV. PredefinedSplit then
# pins the split: rows marked -1 are never placed in a test fold, rows marked 0
# form the single test fold (i.e. the validation rows).
full_X = np.vstack([train_X, valid_X])
full_y = np.append(train_y, valid_y)
# Builtin int is used for the dtype: the np.int alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
test_fold = np.append(
    np.full(train_X.shape[0], -1, dtype=int),
    np.zeros(valid_X.shape[0], dtype=int),
)
ps = PredefinedSplit(test_fold)

# find the hyper-parameters with the best AUC score on the validation fold
cv = GridSearchCV(clf, PARAMS, cv=ps, scoring="roc_auc", refit=False, verbose=4)
cv.fit(full_X, full_y)

# Refit the best configuration manually on the training rows only, since the
# built-in refit would train on full_X, full_y (validation rows included).
model = XGBClassifier(n_estimators=100, tree_method="gpu_hist", random_state=13, **cv.best_params_)
model.fit(train_X, train_y)

# show the best parameters
print(f"Highest AUC = {round(cv.best_score_, 4)} with parameters:")
print(cv.best_params_)

In [None]:
# Second column of predict_proba: probability of the positive (resp > 0) class
# for every validation row.
probs = model.predict_proba(valid_X)[:, 1]

# precision and recall as functions of the decision threshold
precisions, recalls, thresholds = precision_recall_curve(valid_y, probs)

fig, ax = plt.subplots(figsize=(8, 5))
# the last precision/recall point is sliced off so both series align with thresholds
ax.plot(thresholds, precisions[:-1], "tab:blue", label="Precision")
ax.plot(thresholds, recalls[:-1], "tab:orange", label="Recall")
ax.legend()
ax.set_xlabel("Threshold")
ax.set_title("Validation Precision/recall at threshold")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# precision plotted directly against recall
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(recalls, precisions, "tab:blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Validation  Precision at recall")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# ROC curve, with the gray diagonal as a visual reference line
false_positives, true_positives, thresholds = roc_curve(valid_y, probs)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(false_positives, true_positives, "tab:blue")
ax.plot([0, 1], [0, 1], "tab:gray")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("Validation ROC curve")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# area under the ROC curve on the validation rows
print(f"Validation AUC: {roc_auc_score(valid_y, probs)}")

In [None]:
# Second column of predict_proba: probability of the positive (resp > 0) class
# for every training row.
probs = model.predict_proba(train_X)[:, 1]

# precision and recall as functions of the decision threshold
precisions, recalls, thresholds = precision_recall_curve(train_y, probs)

fig, ax = plt.subplots(figsize=(8, 5))
# the last precision/recall point is sliced off so both series align with thresholds
ax.plot(thresholds, precisions[:-1], "tab:blue", label="Precision")
ax.plot(thresholds, recalls[:-1], "tab:orange", label="Recall")
ax.legend()
ax.set_xlabel("Threshold")
ax.set_title("Training Precision/recall at threshold")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# precision plotted directly against recall
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(recalls, precisions, "tab:blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Training  Precision at recall")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# ROC curve, with the gray diagonal as a visual reference line
false_positives, true_positives, thresholds = roc_curve(train_y, probs)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(false_positives, true_positives, "tab:blue")
ax.plot([0, 1], [0, 1], "tab:gray")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("Training ROC curve")
ax.axis([0, 1, 0, 1])
ax.grid(True)
plt.show()

# area under the ROC curve on the training rows
print(f"Training AUC: {roc_auc_score(train_y, probs)}")

In [None]:
# Persist the selected feature names (as JSON) and the trained model to the
# current directory.
feat_dict = {"features": features}
# The handle is named `f` so it does not rebind the module-level `file`
# variable, which holds the training CSV path, to a closed file object.
with open(os.path.join(os.curdir, "features.json"), "w") as f:
    json.dump(feat_dict, f)

model.save_model("model.xgb")