In [None]:
import sys, os
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 

%matplotlib inline

In [None]:
# Directory (relative to the notebook) from which the input CSV files are read.
DATA_DIR = '../input/jane-street-market-prediction'

In [None]:
# Read train.csv from DATA_DIR into a DataFrame.
train_csv_path = os.path.join(DATA_DIR, "train.csv")
train_features_df = pd.read_csv(train_csv_path)

In [None]:
# --- Initial preprocessing (modifies train_features_df in place) ---
# Replace every missing value with the sentinel -999.
train_features_df.fillna(value=-999, inplace=True)

# Remove the rows whose weight is exactly zero.
zero_weight_rows = train_features_df.index[train_features_df['weight'] == 0]
train_features_df.drop(index=zero_weight_rows, inplace=True)

# Binary target column: 1 where resp is strictly positive, 0 otherwise.
train_features_df['action'] = train_features_df['resp'].gt(0).astype('int')

In [None]:
# Names of all columns whose label contains the substring 'feature',
# kept in the frame's column order.
train_features = np.array(
    list(filter(lambda col: 'feature' in col, train_features_df.columns))
)

In [None]:
# Build the X and y arrays by column *name* instead of by position.
# Positional slicing depends on the exact column layout of the frame and
# converts the whole frame to a single array twice. Selecting the columns
# listed in train_features keeps column i of X aligned with train_features[i],
# which the feature-importance cells below rely on when they look up names.
X = train_features_df[list(train_features)].values
# Taking the 'action' column directly keeps the labels in their integer dtype.
y = train_features_df['action'].values

In [None]:
# Hold out 5% of the rows for testing; the split is stratified on y and uses a
# fixed seed.
split_options = dict(test_size=0.05, random_state=2020, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Random forest configuration: 100 trees split on Gini impurity, a fixed seed,
# and n_jobs=-1 for parallel execution.
forest_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'n_jobs': -1,
    'random_state': 2020,
}
clf = RandomForestClassifier(**forest_params)

In [None]:
# Reuse the previously saved model when its file is available; otherwise fit
# the classifier configured above on the training split, so the notebook still
# runs end-to-end when the saved model is not attached.
MODEL_PATH = '../input/jsmp-random-forest/random_forest.bin'
if os.path.exists(MODEL_PATH):
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code - only load model files from a trusted source.
    clf = joblib.load(MODEL_PATH)
else:
    clf.fit(X_train, y_train)

In [None]:
# Importance scores of the fitted model: one value per input feature, indexed
# by the feature's column position.
feature_importances = clf.feature_importances_

# The scores are matched to names in train_features purely by position, so a
# model with a different number of input features would be labelled wrongly
# (or fail later with an IndexError). Fail early with a clear message instead.
if len(feature_importances) != len(train_features):
    raise ValueError(
        f"Model reports {len(feature_importances)} feature importances but "
        f"{len(train_features)} feature names are available"
    )

In [None]:
# Feature indices ordered from the largest importance to the smallest
# (ascending argsort, then reversed).
feature_imp_idx = np.flip(np.argsort(feature_importances))

In [None]:
# Print the k most important features as "<name>\t <importance>".
k = 20
# Slicing (rather than indexing with range(k)) clamps to the number of
# available features, so a k larger than the feature count cannot raise
# an IndexError.
for idx in feature_imp_idx[:k]:
    print(f"{train_features[idx]}\t {feature_importances[idx]}")

In [None]:
# Bar chart of the k most important features, most important first.
# Slice the ranking once; this also clamps to the number of available features.
top_idx = feature_imp_idx[:k]

fig, ax = plt.subplots(figsize=(15, 10))
ax.bar(train_features[top_idx], feature_importances[top_idx])
# Title reports the number of bars actually drawn (equals k whenever at least
# k features exist).
ax.set_title(f"Top {len(top_idx)} features")
# Axis labels so the figure can be read on its own.
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
# Feature names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=90)
plt.show()