In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import necessary packages

In [None]:
import os
# Switch into the competition input directory before importing `janestreet`
# (presumably so the module located there is found on import — TODO confirm),
# then move to the writable working directory for the rest of the notebook.
os.chdir('/kaggle/input/jane-street-market-prediction/')
import janestreet
os.chdir('/kaggle/working')
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import shap

## Load the data files

In [None]:
# Directory holding the competition CSVs; defined once so the location is easy to change.
DATA_DIR = '/kaggle/input/jane-street-market-prediction'


def load_competition_csv(file_name):
    """Read one competition CSV from DATA_DIR into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The parsed file. It is decoded as 'utf-8-sig', which strips a leading
        byte-order mark when one is present.
    """
    return pd.read_csv(os.path.join(DATA_DIR, file_name), encoding='utf-8-sig')


sample_prediction_df = load_competition_csv('example_sample_submission.csv')
features = load_competition_csv('features.csv')
test_data = load_competition_csv('example_test.csv')
train_data = load_competition_csv('train.csv')

## EDA

In [None]:
# Print the (rows, columns) shape of the training frame, then display its first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Print the (rows, columns) shape of the feature metadata frame, then display its first rows.
print(features.shape)
features.head()

In [None]:
# Print the (rows, columns) shape of the example test frame, then display its first rows.
print(test_data.shape)
test_data.head()

In [None]:
# Print the (rows, columns) shape of the example submission frame, then display its first rows.
print(sample_prediction_df.shape)
sample_prediction_df.head()

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the training columns.
train_data.describe()

In [None]:
total_rows = train_data.shape[0]
print('Number of rows in data:', total_rows)

# Per-column NaN counts, largest first, plus the same figure as a percentage of all rows.
columns_in_train_data_nan = (
    train_data.isna()
    .sum()
    .to_frame(name='Number of NaNs')
    .sort_values(by='Number of NaNs', ascending=False)
)
columns_in_train_data_nan['% NaNs'] = columns_in_train_data_nan['Number of NaNs'] / total_rows * 100

# Display only the columns that are missing in more than 100,000 rows.
many_nans = columns_in_train_data_nan['Number of NaNs'] > 100000
columns_in_train_data_nan.loc[many_nans]

In [None]:
# Fill NaNs with mean of column:
# The frame is modified in place, so `train_data` no longer contains the raw NaNs after this cell.
# NOTE(review): the prediction loop at the end of the notebook fills missing test
# features with 0 rather than with these column means — confirm the mismatch is intended.
train_data.fillna(train_data.mean(), inplace = True)

In [None]:
# Summary statistics over the distinct values of the `date` column.
unique_dates = pd.Series(train_data['date'].unique())
unique_dates.to_frame(name='Number of days').describe()

The data contains 500 trading days.

In [None]:
# Count the rows whose weight is exactly zero and the rows whose weight is anything else.
is_zero_weight = train_data['weight'] == 0
print('Number of rows with weight 0:', int(is_zero_weight.sum()))
print('Number of rows with weight non-zero:', int((~is_zero_weight).sum()))

Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades do not contribute to the scoring evaluation.

In [None]:
# For each row of the feature metadata, count the cells equal to True and plot the counts by row index.
true_cell_mask = features == True
true_cell_mask.sum(axis=1).plot()

`feature_0` is the only feature that has no tag set to True.

In [None]:
# Number of rows recorded for each `date` value, plotted against the date.
trades_per_day = (
    train_data.groupby('date')
    .size()
    .rename('# Trades in a day')
    .reset_index()
)
trades_per_day.plot(x='date', y='# Trades in a day', title='Trades in a day [1-500]')

In [None]:
# Correlation analysis from <https://www.kaggle.com/isaienkov/jane-street-market-prediction-fast-understanding>

def find_highly_correlated_columns(frame, threshold=0.95):
    """List the columns that take part in at least one strongly correlated pair.

    The correlation matrix is computed once with ``DataFrame.corr`` instead of
    calling ``Series.corr`` separately for every pair of columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose columns are compared pairwise.
    threshold : float, default 0.95
        A pair is reported when the absolute correlation is strictly greater
        than this value.

    Returns
    -------
    list of str
        Both column names of every qualifying pair, in row-major pair order.
        A column that appears in several pairs is repeated.
    """
    corr_abs = frame.corr().abs()
    names = corr_abs.columns.tolist()

    # Only look above the diagonal: each pair is checked once and a column is
    # never compared with itself.
    above_diagonal = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
    row_idx, col_idx = np.where((corr_abs.to_numpy() > threshold) & above_diagonal)

    flagged = []
    for i, j in zip(row_idx, col_idx):
        flagged.extend((names[i], names[j]))
    return flagged


# Correlation
corr_high_columns = find_highly_correlated_columns(train_data)

In [None]:
# Remove duplicates while keeping first-seen order. A set would give an order
# that can change between kernel runs, which would reshuffle the axes of any
# plot built from this list.
corr_high_columns = list(dict.fromkeys(corr_high_columns))
print('Number of columns:', len(corr_high_columns))

In [None]:
# Display the de-duplicated list of columns that appear in a highly correlated pair.
corr_high_columns

In [None]:
#Correlation matrix
# Build the sub-matrix once and reuse it for the image and both sets of tick labels.
corr_subset = train_data[corr_high_columns].corr()
tick_positions = range(corr_subset.shape[1])

f = plt.figure(figsize=(22, 22))
plt.matshow(corr_subset, fignum=f.number)

# The title states the same 0.95 cut-off that was used to select these columns.
plt.title('Correlation matrix - for corr above 0.95')
plt.xticks(tick_positions, corr_subset.columns, fontsize=14, rotation=90)
plt.yticks(tick_positions, corr_subset.columns, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

In [None]:
#Action metric created using: <https://www.kaggle.com/hamditarek/market-prediction-xgboost-with-gpu-fit-in-1min>
# Label a row 1 when weight times the mean of resp_1..resp_4 is positive, otherwise 0.
# Earlier variant based on `resp` alone, kept for reference:
# train_data['action'] = ((train_data['weight'].values * train_data['resp'].values) > 0).astype('int')
resp_total = train_data['resp_1'] + train_data['resp_2'] + train_data['resp_3'] + train_data['resp_4']
weighted_mean_resp = (train_data['weight'].values * resp_total.values) / 4
train_data['action'] = (weighted_mean_resp > 0).astype('int')

# Keep only rows with non-zero weight for modelling.
# Alternative that keeps every row, kept for reference:
# train_data_for_model = train_data.copy(deep = True)
has_weight = train_data['weight'] != 0
train_data_for_model = train_data[has_weight]

# Inputs are all columns whose name contains 'feature'; the target is the new 'action' column.
feature_mask = train_data_for_model.columns.str.contains('feature')
X_train = train_data_for_model.loc[:, feature_mask]
y_train = train_data_for_model['action']

In [None]:
# Print the (rows, columns) shape of the model inputs, then display their first rows.
print(X_train.shape)
X_train.head()

In [None]:
# Print the target's shape and its sum (the number of rows labelled 1, since the
# label is 0/1), then display the first values.
print(y_train.shape)
print(y_train.sum())
y_train.head()

In [None]:
# Drop the names of objects that are not used again; X_train and y_train are kept.
del columns_in_train_data_nan, train_data, features, test_data, train_data_for_model, corr_high_columns

In [None]:
import gc
# Run a garbage-collection pass; as the cell's last expression, the value returned
# by gc.collect() is shown as the cell output.
gc.collect()

## Model

In [None]:
# clf = xgb.XGBClassifier(use_label_encoder=False,
#     n_estimators=500,
#     max_depth=10,
#     learning_rate=0.06,
#     subsample=0.9,
#     colsample_bytree=0.7,
#     random_state=42,
#     tree_method='gpu_hist'  # Treats numerical variable as bins (makes process much faster)
# )

In [None]:
# %time clf.fit(X_train, y_train)

In [None]:
import pickle
# Location of the pickled model. The path is relative, so it is resolved against
# the current working directory.
filename = '../input/jane-street-pred-model-weights/Jane_Street_stock_market_pred.sav'
# Saving is disabled; it is kept for reference because training is commented out above.
# pickle.dump(clf, open(filename, 'wb'))

In [None]:
# Load the previously saved classifier. The context manager closes the file
# handle once the model has been read.
# NOTE(review): unpickling can execute arbitrary code, so only load model files
# from a trusted source.
with open(filename, 'rb') as model_file:
    clf = pickle.load(model_file)

## Feature importance

In [None]:
# #SHAP plots
# # Create object that can calculate shap values
# explainer = shap.TreeExplainer(clf)

# df = X_train.sample(n=1000)
# # calculate shap values. This is what we will plot.
# shap_values = explainer.shap_values(df)

# # Make plot
# shap.summary_plot(shap_values, df)

Features 39, 64 & 20 strictly increase the action probability. The other features may be dependent on each other. Dimensionality reduction is required to train a better model.

## Prediction

In [None]:
# submission_df = pd.DataFrame(data = None, columns = ['action'])

In [None]:
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Iterate over the iterator created above instead of calling env.iter_test() a second time.
for (test_df, sample_prediction_df) in iter_test:
    # .item() requires exactly one value, so each test_df must hold a single row.
    if test_df['weight'].item() > 0:
        feature_columns = test_df.columns.str.contains('feature')
        # NOTE(review): training data had NaNs replaced by column means, while
        # missing values here become 0 — confirm this difference is intended.
        X_test = test_df.loc[:, feature_columns].fillna(0)
        y_preds = clf.predict(X_test)
        sample_prediction_df.action = y_preds.astype(int)
    else:
        # Rows whose weight is not positive skip the model and get action 0.
        sample_prediction_df.action = 0
    env.predict(sample_prediction_df)