In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

import datatable as dt
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading

In [None]:
%%time
# the fastest way to load big data I ever know
folder_path = '../input/jane-street-market-prediction/'
data = dt.fread(folder_path + 'train.csv').to_pandas()
features = dt.fread(folder_path + 'features.csv').to_pandas()
sample = dt.fread(folder_path + 'example_sample_submission.csv').to_pandas()
example_test = dt.fread(folder_path + 'example_test.csv').to_pandas()

# Missing Values Handling

As a first pass, simply drop every row that contains a missing value.

In [None]:
# Fraction of rows that survive when every row with a missing value is dropped.
data.dropna().shape[0] / data.shape[0]

In [None]:
# Keep only fully populated rows (a row is removed if any of its columns is NaN).
data = data.dropna(axis=0, how="any")

# Getting action via maximizing utility score

In [None]:
def get_action(resp_threshold = 0, frame = None):
    """Build the binary action target from the ``resp`` column.

    Parameters
    ----------
    resp_threshold : float, default 0
        A row is labelled 1 when its ``resp`` is strictly greater than this
        value and 0 otherwise. Rows whose ``resp`` is NaN compare as False
        and therefore get 0.
    frame : pandas.DataFrame, optional
        Table holding a ``resp`` column. When omitted, the notebook-level
        ``data`` frame is used, which matches every existing call.

    Returns
    -------
    pandas.Series
        Unnamed float64 Series of 0.0 / 1.0 values, indexed like the source
        frame.
    """
    source = data if frame is None else frame
    # One vectorized comparison replaces building two index sets; the old
    # drop() call copied the whole frame just to obtain the complement.
    is_positive = (source["resp"] > resp_threshold).to_numpy()
    # Explicit dtype: a dtype-less empty Series is deprecated and becomes
    # object dtype on newer pandas, which downstream estimators reject.
    return pd.Series(is_positive.astype("float64"), index=source.index)

# Target for every row of `data`, using the default threshold: 1 where resp > 0, else 0.
action = get_action()

# An Echo of EDA

There are already a lot of EDA attempts among the competition notebooks, so I will not go into much detail here.

In [None]:
# Number of rows recorded for each value of `date`.
# value_counts() orders by frequency, so sort_index() has to be chained to get
# date order; calling it on its own line discards the sorted result.
date_freq = data["date"].value_counts().sort_index()
date_freq = pd.DataFrame({"Date" : date_freq.index, "ticks" : date_freq.values})
ax = sns.barplot(x="Date", y="ticks", data=date_freq)
# Label only every 50th bar so the x axis stays readable.
# NOTE(review): bar positions are 0-based slots, so these labels equal the
# date values only if dates are consecutive starting at 0 - confirm.
ticks = [int(i) for i in np.linspace(0, 500, 11)]
ax.set_xticks(ticks)
ax.set_xticklabels(ticks)
ax.set_title("Number of rows per date");


# Predictive Modelling

In [None]:
# Time-based hold-out: dates before 450 go to training, the rest to testing.
train = data[data["date"] < 450]
test = data[data["date"] >= 450]

# Columns excluded from the model inputs; listed once so both splits stay in sync.
non_feature_columns = ["weight", "resp", "resp_1", "resp_2", "resp_3", "resp_4", "date", "ts_id"]

# Shuffle the rows of each split. A fixed random_state makes the order, and
# everything derived from it, reproducible across kernel restarts.
X_train = train.drop(columns=non_feature_columns).sample(frac=1, random_state=0)
X_test = test.drop(columns=non_feature_columns).sample(frac=1, random_state=0)

# Align the targets with the shuffled rows by index label.
y_train = action.loc[X_train.index]
y_test = action.loc[X_test.index]

# The intermediate frames are no longer needed; release them.
del train
del test

### non-optimized catboost
GPU reduces training time dramatically

In [None]:
from catboost import CatBoostClassifier

In [None]:
# %%time
# clf = CatBoostClassifier(iterations=1000,
#                            task_type="GPU",
#                            devices='0:1')
# clf.fit(X_train, y_train, verbose=False)

In [None]:
from sklearn.metrics import plot_roc_curve
# plot_roc_curve(clf, X_train, y_train)
# plt.show()

In [None]:
from sklearn.metrics import plot_roc_curve
# proba = clf.predict_proba(X_test)[:, 1]
# plot_roc_curve(clf, X_test, y_test)
# plt.show()

The AUC score is really low on both the test and the train set. Could this be a problem of a non-optimized classifier?

### catboost optimized

This time I will use a small part of the dataset as a validation subset, which significantly reduces the optimization time.

In [None]:
# 10% subsample of the training rows. The fixed random_state keeps the subset
# identical between runs.
X_valid = X_train.sample(frac=0.1, random_state=0)
# Pick the matching targets by index label.
y_valid = action.loc[X_valid.index]

In [None]:
from sklearn.metrics import make_scorer, roc_auc_score

# ROC-AUC scorer: higher is better, computed from continuous decision scores
# (needs_threshold=True) rather than from hard class predictions.
roc_auc = make_scorer(
    roc_auc_score,
    needs_threshold=True,
    greater_is_better=True,
)

# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time

In [None]:
# Classifier with default hyper-parameters, configured to train on the GPU.
clf = CatBoostClassifier(
    devices='0:1',
    task_type="GPU",
)

Before optimization, it is worth looking at the training time on the validation subset.

In [None]:
# %%time
# clf.fit(X_valid, y_valid)

In [None]:
# # Defining your search space
# search_spaces = {'iterations': Integer(10, 1000),
#                  'depth': Integer(1, 8),
#                  'learning_rate': Real(0.01, 1.0, 'log-uniform'),
#                  'random_strength': Real(1e-9, 10, 'log-uniform'),
#                  'bagging_temperature': Real(0.0, 1.0),
#                  'border_count': Integer(1, 255),
#                  'l2_leaf_reg': Integer(2, 30),
#                  'scale_pos_weight':Real(0.01, 1.0, 'uniform')}

In [None]:
# # Setting up BayesSearchCV
# opt = BayesSearchCV(clf,
#                     search_spaces,
#                     scoring=roc_auc,
# #                     cv=skf,
#                     n_iter=100,
#                     n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
#                     return_train_score=False,
#                     refit=True,
#                     optimizer_kwargs={'base_estimator': 'GP'},
#                     random_state=42)

In [None]:
# %%time
# opt.fit(X_valid, y_valid)

In [None]:
# best_score = opt.best_score_
# # best_score_std = d.iloc[optimizer.best_index_].std_test_score
# best_params = opt.best_params_

In [None]:
# best_params

In [None]:
# Hyper-parameters hard-coded so the model can be rebuilt without re-running a search.
best_params = {
    'bagging_temperature': 0.0,
    'border_count': 197,
    'depth': 8,
    'iterations': 1000,
    'l2_leaf_reg': 2,
    'learning_rate': 0.05120988491241486,
    'random_strength': 1.338569339104125e-05,
    'scale_pos_weight': 0.763457985922633,
}

In [None]:
# best_score

In [None]:
# Rebuild the classifier with the stored hyper-parameters (GPU training) and
# fit it on the training split.
best_clf = CatBoostClassifier(task_type="GPU", devices='0:1', **best_params)
best_clf.fit(X_train, y_train)

In [None]:
# ROC curve on the rows the model was fitted on. The legend name and title
# distinguish this figure from the otherwise identical hold-out plot.
plot_roc_curve(best_clf, X_train, y_train, name="CatBoost (train)").ax_.set_title("ROC curve - training split")
plt.show()

In [None]:
# ROC curve on the hold-out rows (date >= 450). The legend name and title
# distinguish this figure from the otherwise identical training plot.
plot_roc_curve(best_clf, X_test, y_test, name="CatBoost (test)").ax_.set_title("ROC curve - test split")
plt.show()

The increase in the train ROC-AUC score is caused mostly by the higher max depth. Despite the hyperparameter optimization, there is no change in test performance. 
This may be caused by neglecting the temporal structure of the data.

# Submission

In [None]:
import janestreet
# Create the evaluation environment through the janestreet module; the
# predictions are later handed back to this same object via env.predict().
env = janestreet.make_env() # initialize the environment
# Iterator consumed by the submission loop; each step yields a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
%%time
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df.action = best_clf.predict(test_df[X_train.columns]).astype(int)
    env.predict(sample_prediction_df)