<h2><center>Jane Street Market Prediction. Fast understanding.</center></h2>

<center><img src="https://karijere.fer.hr/wp-content/uploads/2018/10/Jane-Street.jpg"></center>

<div class="list-group" id="list-tab" role="tablist">
<h2 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0; color:white' role="tab" aria-controls="home"><center>Quick navigation</center></h2>

* [1. Train set description](#1)
* [2. Features description](#2)
* [3. Test example description](#3)
* [4. Sample submission example](#4)
* [5. Modeling](#5)

In [None]:
import numpy as np
import pandas as pd

import janestreet

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import optuna
from optuna.samplers import TPESampler

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

<a id="1"></a>
<h2 style='background:black; border:0; color:white'><center>1. Train set description</center></h2>

**train.csv** - the training set, contains historical data and returns

In [None]:
# Read only the first 10,000 rows of train.csv for the exploratory plots below.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv', nrows=10000)
train.head()

Let's check the distributions of the target columns.

In [None]:
# Histograms of the `weight` column and the five `resp*` columns on a 3x2 grid.
plot_list = ['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
fig = make_subplots(rows=3, cols=2)

traces = [
    go.Histogram(x=train[col], nbinsx=100, name=col)
    for col in plot_list
]

# Fill the grid row by row (two plots per row). `add_trace` with explicit
# row/col replaces the deprecated `append_trace`.
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // 2) + 1, col=(i % 2) + 1)

fig.update_layout(
    title_text='Target features distributions',
    height=900,
    width=800
)

fig.show()

And now let's look at the distributions of all the features.

In [None]:
# Skip the first 7 columns of `train` and keep the following 130 column labels.
features = train.columns[7:7 + 130]

In [None]:
# One histogram per feature column, three per row. The row count is derived
# from the number of features (ceiling division) so the grid always fits.
fig = make_subplots(
    rows=-(-len(features) // 3),
    cols=3
)

traces = [
    go.Histogram(x=train[col], nbinsx=100, name=col)
    for col in features
]

# `add_trace` with explicit row/col replaces the deprecated `append_trace`.
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // 3) + 1, col=(i % 3) + 1)

fig.update_layout(
    title_text='Train features distributions',
    height=5000
)

fig.show()

In [None]:
# Keep the feature column labels under a second name: `features` is rebound to
# the features.csv DataFrame in a later cell, while the correlation scan further
# down still needs these labels.
cols = features

<a id="2"></a>
<h2 style='background:black; border:0; color:white'><center>2. Features description</center></h2>

**features.csv** - metadata pertaining to the anonymized features

In [None]:
# NOTE(review): this rebinds `features`, which until now held the feature
# column labels of `train`; those labels remain available as `cols`.
features = pd.read_csv('/kaggle/input/jane-street-market-prediction/features.csv')
features

<a id="3"></a>
<h2 style='background:black; border:0; color:white'><center>3. Test example description</center></h2>

**example_test.csv** - a mock test set which represents the structure of the unseen test set. You will not be directly using the test set or sample submission in this competition, as the time-series API will get/set the test set and predictions.

In [None]:
# Load the mock test set and display it to inspect its structure.
example_test = pd.read_csv('/kaggle/input/jane-street-market-prediction/example_test.csv')
example_test

<a id="4"></a>
<h2 style='background:black; border:0; color:white'><center>4. Sample submission example</center></h2>

**example_sample_submission.csv** - a mock sample submission file in the correct format

In [None]:
# Load the mock submission file and display it to inspect the expected format.
submission = pd.read_csv('/kaggle/input/jane-street-market-prediction/example_sample_submission.csv')
submission

<a id="5"></a>
<h2 style='background:black; border:0; color:white'><center>5. Modeling</center></h2>

In [None]:
# Reload train.csv in full; the earlier exploratory cell read only the first
# 10,000 rows into `train`.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

Time to find pairs of features with high correlation.

In [None]:
%%time

all_columns = []
for i in range(0, len(cols)):
    for j in range(i+1, len(cols)):
        if abs(train[cols[i]].corr(train[cols[j]])) > 0.95:
            all_columns = all_columns + [cols[i], cols[j]]

In [None]:
# De-duplicate while keeping first-seen order. `set` ordering of strings can
# change between kernel sessions, which would shuffle the heat-map axes below.
all_columns = list(dict.fromkeys(all_columns))
print('Number of columns:', len(all_columns))

In [None]:
# Heat map of the pairwise correlations among the columns in `all_columns`.
data = train[all_columns]

f = plt.figure(figsize=(22, 22))
plt.matshow(data.corr(), fignum=f.number)

# One tick per column on both axes, labelled with the column names.
plt.xticks(range(data.shape[1]), data.columns, fontsize=14, rotation=90)
plt.yticks(range(data.shape[1]), data.columns, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

Let's check the correlations between the target variables.

In [None]:
# Heat map of the pairwise correlations among `weight` and the `resp*` columns.
data = train[['weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']]

f = plt.figure(figsize=(12, 12))
plt.matshow(data.corr(), fignum=f.number)

# One tick per column on both axes, labelled with the column names.
plt.xticks(range(data.shape[1]), data.columns, fontsize=14, rotation=90)
plt.yticks(range(data.shape[1]), data.columns, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

In [None]:
# Create the competition environment and its test iterator; both are consumed
# by the prediction loop in the last cell of the notebook.
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
# Drop zero-weight rows. `.copy()` makes the filtered frame independent, so
# adding a column below does not raise pandas' SettingWithCopyWarning.
train = train[train['weight'] != 0].copy()
# Binary label: 1 when weight * resp is positive, otherwise 0.
train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int')

In [None]:
# Model inputs: every column whose name contains 'feature'; target: `action`.
X_train = train[[name for name in train.columns if 'feature' in name]]
y_train = train['action']

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=666, test_size=0.2)

In [None]:
# Drop the reference to the full frame; only X_train / y_train are used from
# here on.
del train

In [None]:
# Replace missing feature values with the sentinel -999; the same fill is
# applied to every test row in the prediction loop.
X_train = X_train.fillna(-999)
#X_test = X_test.fillna(-999)

In [None]:
# Seeded TPE sampler for the Optuna study (the study itself is currently
# commented out further down in this cell).
sampler = TPESampler(seed=666)

def create_model(trial):
    """Build an XGBClassifier whose hyperparameters are sampled from `trial`.

    Args:
        trial: Optuna trial used to draw `max_depth`, `n_estimators`,
            `learning_rate`, `subsample` and `colsample_bytree`.

    Returns:
        An unfitted XGBClassifier configured with the sampled values, a fixed
        random_state of 666 and the 'gpu_hist' tree method.
    """
    max_depth = trial.suggest_int("max_depth", 2, 12)
    n_estimators = trial.suggest_int("n_estimators", 2, 600)
    # `suggest_float` draws uniformly from [low, high] by default and replaces
    # the deprecated `suggest_uniform`.
    learning_rate = trial.suggest_float('learning_rate', 0.0001, 0.99)
    subsample = trial.suggest_float('subsample', 0.0001, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.0000001, 1)

    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=666,
        tree_method='gpu_hist'
    )
    return model

def objective(trial):
    """Optuna objective: hold-out accuracy of a model built from `trial`.

    The model is fitted on the first 80% of the rows of the global
    `X_train` / `y_train` and scored on the remaining 20%. Scoring on the rows
    used for fitting would reward memorisation and push the search towards the
    deepest, largest models regardless of how well they generalise.

    Args:
        trial: Optuna trial passed through to `create_model`.

    Returns:
        Accuracy on the held-out rows (higher is better).
    """
    model = create_model(trial)

    # Positional slices avoid shuffling and keep the hold-out at the tail of
    # the data (rows are assumed to be in chronological order; TODO confirm).
    split_at = int(len(X_train) * 0.8)
    X_fit, y_fit = X_train.iloc[:split_at], y_train.iloc[:split_at]
    X_val, y_val = X_train.iloc[split_at:], y_train.iloc[split_at:]

    model.fit(X_fit, y_fit)
    return accuracy_score(y_val, model.predict(X_val))

# Hyperparameter search, currently disabled; the hand-picked parameter sets
# below are used instead.
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=70)
# params = study.best_params
# params['random_state'] = 666
# params['tree_method'] = 'gpu_hist'

# First ensemble member.
params1 = dict(
    max_depth=8,
    n_estimators=500,
    learning_rate=0.01,
    subsample=0.9,
    tree_method='gpu_hist',
    random_state=666,
)

# Disabled second ensemble member.
# params2 = dict(
#     max_depth=9,
#     n_estimators=500,
#     learning_rate=0.03,
#     subsample=0.9,
#     tree_method='gpu_hist',
#     random_state=666,
# )

# Third ensemble member: deeper trees, higher learning rate, column subsampling.
params3 = dict(
    max_depth=10,
    n_estimators=500,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.7,
    tree_method='gpu_hist',
    random_state=666,
)

In [None]:
# Build both active ensemble members, then fit each on the full training data.
# (model2 / params2 are disabled.)
model1 = XGBClassifier(**params1)
model3 = XGBClassifier(**params3)

model1.fit(X_train, y_train)
model3.fit(X_train, y_train)

In [None]:
for (test_df, sample_prediction_df) in iter_test:
    # Default action is 0; it is switched to 1 only for a positive-weight row
    # on which both models predict class 1.
    y_preds = np.array([0])

    # `.item()` requires the weight column to hold exactly one value.
    if test_df['weight'].item() > 0:
        X_test = test_df.loc[:, test_df.columns.str.contains('feature')].fillna(-999)
        # Sum of the two 0/1 predictions: 2 means a unanimous vote for 1.
        votes = model1.predict(X_test) + model3.predict(X_test)
        if votes == 2:
            y_preds = np.array([1])

    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)