## Import packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
os.chdir('/kaggle/input/jane-street-market-prediction/')
import janestreet
os.chdir('/kaggle/working')
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import shap
import tqdm

## Load data files

In [None]:
# Read the competition CSV files; every file is decoded with the 'utf-8-sig' codec.
sample_prediction_df = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/example_sample_submission.csv',
    encoding='utf-8-sig',
)
features = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/features.csv',
    encoding='utf-8-sig',
)
test_data = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/example_test.csv',
    encoding='utf-8-sig',
)
train_data = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    encoding='utf-8-sig',
)

## EDA

In [None]:
# Print the (rows, columns) shape of the training frame, then display its first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Print the shape of the frame read from features.csv, then display its first rows.
print(features.shape)
features.head()

In [None]:
# Print the shape of the example test frame, then display its first rows.
print(test_data.shape)
test_data.head()

In [None]:
# Print the shape of the example submission frame, then display its first rows.
print(sample_prediction_df.shape)
sample_prediction_df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every training column.
train_data.describe()

In [None]:
# Per-column NaN counts, sorted from most to fewest missing values.
n_rows = train_data.shape[0]
print('Number of rows in data:', n_rows)

nan_counts = train_data.isna().sum()
columns_in_train_data_nan = (
    pd.DataFrame(nan_counts)
    .rename(columns={0: 'Number of NaNs'})
    .sort_values(by=['Number of NaNs'], ascending=False)
)
# Share of missing rows per column, as a percentage of all rows.
columns_in_train_data_nan['% NaNs'] = (columns_in_train_data_nan['Number of NaNs'] / n_rows) * 100

# Show only the columns with more than 100000 missing values.
many_nans = columns_in_train_data_nan['Number of NaNs'] > 100000
columns_in_train_data_nan[many_nans]

In [None]:
# Replace every NaN with the mean of its own column (train_data is modified in place).
column_means = train_data.mean()
train_data.fillna(column_means, inplace=True)

In [None]:
# Summary statistics over the distinct values of the 'date' column;
# the 'count' row is the number of distinct dates.
pd.DataFrame(train_data['date'].unique()).describe().rename(columns = {0:'Number of days'})

The data covers 500 trading days.

In [None]:
# Count rows whose weight is exactly zero versus all remaining rows.
zero_weight_mask = train_data['weight'] == 0
print('Number of rows with weight 0:', train_data[zero_weight_mask].shape[0])
print('Number of rows with weight non-zero:', train_data[~zero_weight_mask].shape[0])

Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades do not contribute to the scoring evaluation.

In [None]:
# For each row of the features frame, count the cells equal to True and plot those counts.
features[features==True].count(axis = 1).plot()

`feature_0` is the only feature with no tag set to True.

In [None]:
# Number of rows per 'date' value, plotted against the date.
trades_per_day = train_data.groupby(['date']).size().reset_index()
trades_per_day = trades_per_day.rename(columns={0: '# Trades in a day'})
trades_per_day.plot('date', '# Trades in a day', title='Trades in a day [1-500]')

In [None]:
# Correlation analysis from <https://www.kaggle.com/isaienkov/jane-street-market-prediction-fast-understanding>

# Correlation
# Build the full pairwise Pearson correlation matrix once, instead of calling
# Series.corr() separately for every column pair (each such call is a full pass
# over the data).
cols = train_data.columns.tolist()
abs_corr = train_data.corr().abs().to_numpy()

# The strict upper triangle (k=1) visits each unordered pair (i < j) exactly once,
# in the same row-major order as a nested "for i / for j in range(i+1, ...)" loop.
# NaN correlations compare False and are therefore skipped.
pair_rows, pair_cols = np.where(np.triu(abs_corr > 0.95, k=1))

# Flat list of column names taking part in at least one highly correlated pair;
# a column appears once per pair it belongs to, so the list may contain duplicates.
corr_high_columns = []
for i, j in zip(pair_rows, pair_cols):
    corr_high_columns += [cols[i], cols[j]]

In [None]:
# Remove duplicate column names while keeping first-seen order. A plain set() would
# give an order that depends on string hashing and can change between kernel
# restarts, which would shuffle the axes of any plot built from this list.
corr_high_columns = list(dict.fromkeys(corr_high_columns))
print('Number of columns:', len(corr_high_columns))

In [None]:
# Display the list of column names collected in corr_high_columns.
corr_high_columns

In [None]:
#Correlation matrix
# Select the highly correlated columns once and reuse the subset for the matrix and tick labels.
corr_subset = train_data[corr_high_columns]

f = plt.figure(
    figsize=(22, 22)
)

plt.matshow(
    corr_subset.corr(),
    fignum=f.number
)

# The columns were selected with an absolute-correlation threshold of 0.95,
# so the title reports that same value.
plt.title('Correlation matrix - for corr above 0.95')

tick_positions = range(corr_subset.shape[1])
plt.xticks(
    tick_positions,
    corr_subset.columns,
    fontsize=14,
    rotation=90
)

plt.yticks(
    tick_positions,
    corr_subset.columns,
    fontsize=14
)

cb = plt.colorbar()
cb.ax.tick_params(
    labelsize=14
)

## Modelling

In [None]:
#Action metric created using: <https://www.kaggle.com/hamditarek/market-prediction-xgboost-with-gpu-fit-in-1min>
# Create action metric
# Alternative label based on the single 'resp' column:
# train_data['action'] = ((train_data['weight'].values * train_data['resp'].values) > 0).astype('int')

# action = 1 when weight * (resp_1 + resp_2 + resp_3 + resp_4) / 4 is strictly positive, otherwise 0.
resp_sum = (train_data['resp_1'] + train_data['resp_2'] + train_data['resp_3'] + train_data['resp_4']).values
weighted_mean_resp = (train_data['weight'].values * resp_sum) / 4
train_data['action'] = (weighted_mean_resp > 0).astype('int')

# Only rows with a non-zero weight are used for fitting.
nonzero_weight = train_data['weight'] != 0
train_data_for_model = train_data[nonzero_weight]
# train_data_for_model = train_data.copy(deep = True)

# Feature matrix: every column whose name contains 'feature'. Target: the 'action' column.
is_feature_column = train_data_for_model.columns.str.contains('feature')
X_train = train_data_for_model.loc[:, is_feature_column]
y_train = train_data_for_model.loc[:, 'action']

In [None]:
# Print the shape of the feature matrix, then display its first rows.
print(X_train.shape)
X_train.head()

In [None]:
# Print the label vector's shape and the sum of the labels (labels are 0/1 integers,
# so the sum is the number of rows with action == 1), then display the first labels.
print(y_train.shape)
print(y_train.sum())
y_train.head()

In [None]:
# Drop the names of objects that are no longer needed; X_train and y_train are kept.
del columns_in_train_data_nan, train_data, features, test_data, train_data_for_model, corr_high_columns

In [None]:
# Run a garbage-collection pass after deleting the large frames.
import gc
gc.collect()

In [None]:
# List of X_train column names that contain 'feature'.
# Note: the name `features` previously held the frame read from features.csv (deleted earlier);
# from here on it is a plain list of column names.
features = [c for c in X_train.columns if 'feature' in c]

In [None]:
# Hyper-parameters for the XGBoost classifier, collected in one place.
xgb_params = dict(
    use_label_encoder=False,
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.06,
    subsample=0.9,
    colsample_bytree=0.7,
    random_state=42,
    tree_method='gpu_hist',  # Treats numerical variable as bins (makes process much faster)
)
clf = xgb.XGBClassifier(**xgb_params)

In [None]:
# Fit the classifier on the feature columns and report the wall/CPU time of the call.
%time clf.fit(X_train[features], y_train)

In [None]:
import pickle

# Serialise the fitted classifier to the working directory. The context manager
# guarantees the file handle is flushed and closed even if pickling raises.
with open('Jane_Street_forecasting_weight_xgboost_model_v1.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# filename = '../input/jane-street-pred-model-weights/Jane_Street_forecasting_weight_xgboost_model_v1.sav'
# clf = pickle.load(open(filename, 'rb'))

In [None]:
def normalize_data(df):
    """Standardise ``df`` by subtracting its mean and dividing by its standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data. The statistics come from ``df.mean()`` and ``df.std()``.

    Returns
    -------
    Same type as ``df``
        ``(df - df.mean()) / df.std()``, except that a standard deviation of exactly
        zero is replaced by 1, so a constant column maps to 0 instead of NaN (0/0).
    """
    # Alternative min-max scaling:
    #     return (df-df.min())/(df.max()-df.min())
    spread = df.std()
    if np.ndim(spread) == 0:
        # Series input: std() is a single scalar.
        if spread == 0:
            spread = 1.0
    else:
        # DataFrame input: std() is a per-column Series. NaN entries compare
        # unequal to 0 and are therefore left untouched.
        spread = spread.where(spread != 0, 1.0)
    return (df - df.mean()) / spread
    
# Standardised copy of the feature columns. In the visible part of this notebook it is
# only referenced by the commented-out Keras cells.
df_train = normalize_data(X_train[features])

In [None]:
# import tensorflow as tf
# from keras.layers import Activation, Dense

# model = tf.keras.models.Sequential()

# model.add(tf.keras.layers.LSTM(
#     len(features), 
#     activation='relu', 
#     input_shape=(1, len(features)), 
#     return_sequences=True))

# model.add(tf.keras.layers.Dropout(0.02))

# model.add(Dense(50, activation='swish',input_shape=(len(features), )))

# model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# model.compile(loss=tf.keras.losses.BinaryCrossentropy(), 
#                 optimizer=tf.optimizers.Adam(learning_rate=0.05),
#                 metrics=["accuracy"])
# model.summary()

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping
# model.fit(df_train,
#             epochs = 1,
#             batch_size = 10000,
#             verbose = 1,
#             callbacks = [EarlyStopping(monitor='loss', verbose=1, patience=10)])

## Feature importance

In [None]:
# plot feature importance using built-in function
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
fig, ax = plt.subplots(figsize=(20,30))
plot_importance(clf, ax = ax)
pyplot.show()

In [None]:
#SHAP plots
# Create object that can calculate shap values
explainer = shap.TreeExplainer(clf)

# Explain a random subset of 1000 training rows. The seed is fixed (same value as the
# classifier's random_state) so the same rows, and therefore the same plot, are
# produced on every run.
df = X_train.sample(n=1000, random_state=42)
# calculate shap values. This is what we will plot.
shap_values = explainer.shap_values(df)

# Make plot
shap.summary_plot(shap_values, df)

Features 39, 64 & 20 strictly increase the action probability. The other features may be dependent on each other. Dimensionality reduction is required to train a better model.

## Prediction

In [None]:
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Each iteration yields a test frame and the matching prediction frame to fill in.
for test_df, sample_prediction_df in iter_test:
    has_positive_weight = test_df['weight'].item() > 0
    if not has_positive_weight:
        # Rows without a positive weight are always answered with action 0.
        sample_prediction_df.action = 0
    else:
        # NOTE(review): missing features are filled with 0 here, whereas the training
        # data had NaNs filled with column means — confirm this mismatch is intended.
        X_test = test_df.loc[:, features].fillna(0)
        y_preds = clf.predict(X_test)
        sample_prediction_df.action = y_preds.astype(int)
    # Submit the prediction for this iteration.
    env.predict(sample_prediction_df)