In [None]:
# Install the datatable 0.11.0 wheel from the local ../input directory; stdout and stderr are discarded.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import catboost
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import torch

import janestreet

import warnings
# Suppress every warning (all categories, all modules) for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Load data

In [None]:
# Parse the competition CSV with datatable, then hand it over to pandas.
train_csv_path = '../input/jane-street-market-prediction/train.csv'
train_df = dt.fread(train_csv_path).to_pandas()

# Column dtypes, non-null counts and memory footprint of the loaded frame.
train_df.info()

# Understand data

* How many actions are considered per day?

In [None]:
# Average number of rows per distinct value of the `date` column.
n_dates = len(train_df['date'].unique())
print(f'On average, {train_df.shape[0] / n_dates} actions are considered per day.')

* Handling NaN values

In [None]:
# Number of NaN values in each column of interest:
# `weight`, resp_1..resp_4 and feature_0..feature_129.
features = [f'feature_{i}' for i in range(130)]
resp = [f'resp_{i}' for i in range(1, 5)]
nan_values_count = train_df[['weight'] + resp + features].isna().sum()
print(nan_values_count)

# Share of NaN cells relative to every cell of the full frame.
# DataFrame.size is rows * columns, i.e. the value np.product(train_df.shape)
# used to produce; np.product is deprecated and no longer exists in NumPy 2.0.
print('Percentage of NaN values:', nan_values_count.sum() / train_df.size * 100)

In [None]:
# Disabled experiment: the code below is wrapped in a raw string literal, so it is
# never executed; the cell merely evaluates to that string.
# NOTE(review): pandas rolling windows are trailing by default, so the word "ahead"
# in the text below looks inaccurate -- confirm before re-enabling.
r'''# Filling NaN values by moving average of 17 periods ahead
for feat in tqdm(features):
    train_df[feat] = train_df[feat].fillna(train_df[feat].rolling(17, min_periods = 1).mean())'''

In [None]:
# Disabled experiment: wrapped in a raw string literal, so nothing here runs.
# If enabled, it would overwrite every feature column with its 17-row rolling mean.
r'''for feat in tqdm(features):
    train_df[feat] = train_df[feat].rolling(17, min_periods = 1).mean()'''

* Replace any remaining NaN values with -999

In [None]:
# Overwrite every NaN in the frame, in place, with the sentinel value -999.
train_df.fillna(value=-999, inplace=True)

* Class balance of the target

In [None]:
# Keep only the rows whose weight is non-zero. The explicit .copy() makes the result
# an independent frame, so the column assignment below writes to it directly rather
# than to a filtered slice (the chained-assignment case pandas warns about).
train_df = train_df[train_df['weight'] != 0].copy()

# Binary target: 1 when weight * resp is strictly positive, otherwise 0.
train_df['action'] = ((train_df['weight'].values * train_df['resp'].values) > 0).astype('int')

In [None]:
# Class balance of the target: {label: [row count, share of all rows in %]}.
# value_counts() tallies every label in one vectorised pass; the built-in sum()
# used before walked the whole Series element by element, twice per label.
action_counts = train_df['action'].value_counts()
num_label = {
    label: [int(action_counts[label]), action_counts[label] / train_df.shape[0] * 100]
    for label in train_df['action'].unique()  # keep labels in order of first appearance
}
print(num_label)

* The two classes are reasonably well balanced

# Modelling

* Check how the two action types are distributed over the days, since this affects how we split the data into training and validation sets

In [None]:
# Splitting: ordered hold-out, no shuffling (rows are taken in frame order --
# presumably chronological; confirm against the data).
# `ratio` is the share of rows held out for validation. The cut is therefore placed
# at (1 - ratio): the first 80 % of the rows train the model and the last 20 %
# validate it. Cutting at `n * ratio` instead would train on only 20 % of the rows
# and validate on the remaining 80 %.
ratio = 0.2
cutting_point = int(train_df.shape[0] * (1 - ratio))

train = train_df.iloc[:cutting_point, :]
val = train_df.iloc[cutting_point:, :]

In [None]:
# Row counts per (date, action) pair in the training split.
train_action = train.groupby(['date', 'action']).size()
# Value of the `action` index level for each group, used to select one class at a time.
train_action_level = train_action.index.get_level_values('action')

# Totals per class in the training set
print('The number of 1 in the training set is:')
print(train_action[train_action_level == 1].sum())
print('The number of 0 in the training set is:')
print(train_action[train_action_level == 0].sum())

In [None]:
# Row counts per (date, action) pair in the validation split.
val_action = val.groupby(['date', 'action']).size()
# Value of the `action` index level for each group, used to select one class at a time.
val_action_level = val_action.index.get_level_values('action')

# Totals per class in the validation set
print('The number of 1 in the validation set is:')
print(val_action[val_action_level == 1].sum())
print('The number of 0 in the validation set is:')
print(val_action[val_action_level == 0].sum())

* Modelling

In [None]:
# Hyper-parameters for the CatBoost classifier, collected in a single dict.
params = dict(
    # objective and the metric reported during training
    loss_function='Logloss',
    eval_metric='AUC',
    # run on the GPU only when torch reports that CUDA is available
    task_type='CPU' if not torch.cuda.is_available() else 'GPU',
    # tree-growing settings
    grow_policy='Lossguide',
    iterations=2000,
    learning_rate=0.05,
    random_seed=0,
    l2_leaf_reg=50,
    depth=10,
    max_leaves=10,
    border_count=128,
    # logging period passed to CatBoost
    verbose=50,
)

In [None]:
from catboost import CatBoostClassifier, Pool

# Wrap each split in a CatBoost Pool: the feature columns are the data,
# the `action` column is the label.
train_set = Pool(data=train[features], label=train['action'])
val_set = Pool(data=val[features], label=val['action'])

In [None]:
# Build the classifier from the hyper-parameter dict.
model = CatBoostClassifier(**params)

# Train on the training pool, passing the validation pool as the evaluation set
# and requesting use_best_model=True.
model.fit(X=train_set, eval_set=val_set, use_best_model=True)

# Inference

In [None]:
# Create the competition environment and its test-batch iterator.
try:
    env = janestreet.make_env()
except Exception:
    # Presumably make_env() fails when this cell is re-run in the same kernel
    # (TODO confirm). Reusing an `env` created earlier is fine; if none exists,
    # the failure is real and must not be swallowed -- otherwise the next line
    # would die with an unrelated NameError.
    if 'env' not in globals():
        raise
iter_test = env.iter_test()
prior_test_df = None

In [None]:
%%time
for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df[features]
    # For the rest, fill with -999
    X_test.fillna(-999, inplace = True)
    y_preds = model.predict(X_test)
    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)