# Jane Street Market Prediction - Mutual Information Feature Selection + Random Forest

In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score

from xgboost import XGBClassifier

In [None]:
# Precomputed ordering of the 130 feature indices (a permutation of 0..129),
# stored as whitespace-separated text and split into a list of index strings.
# NOTE(review): presumably ranked by mutual information with the target,
# most informative first -- confirm against whatever produced this list.
mi = (
    "0 61 60 62 63 45 41 44 42 6 4 40 38 39 37 5 18 27 "
    "28 17 91 3 85 97 7 103 94 8 79 96 84 90 106 72 102 78 "
    "108 100 114 88 73 71 82 70 47 46 50 48 76 22 54 31 32 93 "
    "21 105 43 87 81 49 115 99 92 118 98 104 80 109 86 75 11 12 "
    "24 74 33 34 23 112 117 26 116 110 121 35 14 69 123 55 111 13 "
    "25 36 20 120 15 29 19 16 52 30 10 95 83 9 2 51 89 101 "
    "64 1 53 77 68 107 67 66 129 122 125 119 113 56 127 65 58 57 "
    "128 124 126 59"
).split()

# Column names in the same order as the ranking, e.g. "feature_0".
sorted_features = [f"feature_{index}" for index in mi]

In [None]:
# Load the whole training CSV with every column typed as 32-bit float
# instead of pandas' default 64-bit types.
file = os.path.join(os.pardir, "input", "jane-street-market-prediction", "train.csv")

# Read a single row first, only to discover the column names.
header = pd.read_csv(file, nrows=1)
dtype = dict.fromkeys(header.columns, np.float32)

full_df = pd.read_csv(file, dtype=dtype, engine="c")

In [None]:
# Replace every missing value with the sentinel -100.0 (done in place to
# avoid holding a second copy of the full frame).
full_df.fillna(-100.0, inplace=True)

# Date-based split: dates 86..375 train, 425..500 validate (`between` is
# inclusive on both ends); dates 376..424 are left out of both sets.
in_train = full_df["date"].between(86, 375)
in_valid = full_df["date"].between(425, 500)
train_df = full_df[in_train]
valid_df = full_df[in_valid]

# Keep only the first 36 entries of the feature ranking as model inputs.
features = sorted_features[:36]


def _features_and_labels(frame):
    """Return (X, y) arrays for *frame*: selected features and 1.0 where resp > 0."""
    inputs = frame[features].to_numpy()
    labels = (frame["resp"] > 0.0).astype(np.float32).to_numpy()
    return inputs, labels


train_X, train_y = _features_and_labels(train_df)
valid_X, valid_y = _features_and_labels(valid_df)

# Fraction of positive labels in the training set.
print(f"Class imbalance: {train_y.mean()}")

In [None]:
# Random forest with 10 entropy-split trees capped at depth 16; the fixed
# seed makes runs repeatable, and n_jobs=-1 uses every available core.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=16,
    criterion="entropy",
    n_jobs=-1,
    random_state=13,
    verbose=3,
)
model.fit(train_X, train_y)

In [None]:
# Evaluate the fitted model on the validation set: precision/recall curves,
# ROC curve and ROC AUC.

# `plt` was used below without ever being imported, which raised NameError
# on the first plotting call; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

# Probability of the positive class (second column of predict_proba).
probs = model.predict_proba(valid_X)
probs = probs[:, 1]

# precision vs recall
precisions, recalls, thresholds = precision_recall_curve(valid_y, probs)

# precision_recall_curve returns one more precision/recall value than
# thresholds, so the last point is dropped to align the arrays.
plt.figure(figsize=(8, 5))
plt.plot(thresholds, precisions[:-1], "tab:blue", label="Precision")
plt.plot(thresholds, recalls[:-1], "tab:orange", label="Recall")
plt.legend()
plt.xlabel("Threshold")
plt.title("Precision/recall at threshold")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

plt.figure(figsize=(8, 5))
plt.plot(recalls, precisions, "tab:blue")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision at recall")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

# ROC curve (note: `thresholds` is rebound to the ROC thresholds here)
false_positives, true_positives, thresholds = roc_curve(valid_y, probs)
plt.figure(figsize=(8, 5))
plt.plot(false_positives, true_positives, "tab:blue")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], "tab:gray")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve")
plt.axis([0, 1, 0, 1])
plt.grid(True)
plt.show()

# AUC
print(f"AUC: {roc_auc_score(valid_y, probs)}")