In [None]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_classif

In [None]:
# load the full training CSV with every column forced to 32-bit floats
file = os.path.join(os.pardir, "input", "jane-street-market-prediction", "train.csv")
header = pd.read_csv(file, nrows=1)  # one-row peek, used only for its column names
dtype = dict.fromkeys(header.columns, np.float32)
full_df = pd.read_csv(file, engine="c", dtype=dtype)

In [None]:
# keep only the rows whose date falls in the 86..375 window as the training set
# (no validation frame is built here), then replace every missing value with
# the sentinel -100.0
in_window = full_df["date"].between(86, 375)
train_df = full_df[in_window].fillna(-100.0)

# feature columns are the ones whose name contains "feature"
features = [col for col in train_df if "feature" in col]
train_X = train_df[features].to_numpy()

# binary label: 1 where resp is strictly positive, 0 otherwise
train_y = (train_df["resp"] > 0.0).astype(np.uint8).to_numpy()

In [None]:
# estimate the mutual information between each feature column and the binary
# label, treating all features as continuous; random_state is fixed at 13
mi = mutual_info_classif(train_X, train_y, discrete_features=False, random_state=13)

# bar chart of the estimates, one bar per feature in column order
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(x=np.arange(mi.size), y=mi, ax=ax)
ax.set_xlabel("Feature")
ax.set_ylabel("Mutual Information with resp")
# hide tick labels and tick marks on the x axis
ax.set_xticklabels([])
ax.tick_params(bottom=False)
plt.show()

# rank feature positions from highest to lowest estimated mutual information
# (computed once so the printed ranking and the saved ranking cannot diverge)
ranking = np.argsort(-mi)

print("Features sorted by mutual information (descending):")
print(ranking)

# save sorted features
# Positions in `mi` correspond to the columns listed in `features` (the list
# used to build train_X), so map each position back to its real column name
# rather than rebuilding "feature_<position>", which is only right when the
# columns happen to be named and ordered that way.
assert len(features) == mi.size, "features list is out of sync with the MI estimates"
sorted_features = [features[i] for i in ranking]
feat_dict = {"sorted_features": sorted_features}
# use a dedicated handle name so the `file` path variable is not overwritten
with open(os.path.join(os.curdir, "sorted_features.json"), "w") as out_file:
    json.dump(feat_dict, out_file)