<center><img src="https://consulting.brown.edu/images/jane_str.png"></center>

In [None]:
import os
import warnings
import optuna
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

In [None]:
# Random seed used by the train/validation split and by the default XGBoost classifier factory.
SEED = 2809

<a id="1"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>1. Data Loading And Preprocessing</center></h2>

In [None]:
%%time
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
features = pd.read_csv('../input/jane-street-market-prediction/features.csv')
example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')
print ("Data is loaded!")

In [None]:
# Binary target: 1 when weight * resp is strictly positive, 0 otherwise.
weighted_resp = train['weight'].values * train['resp'].values
train['action'] = (weighted_resp > 0).astype('int')

# Model inputs are the columns whose name contains 'feature'; missing values are
# replaced by the -999 sentinel.
is_feature_column = train.columns.str.contains('feature')
X_train = train.loc[:, is_feature_column].fillna(-999)
y_train = train.loc[:, 'action']

In [None]:
# Hold out 25% of the rows for validation, seeded for a repeatable split.
# NOTE(review): X_train / y_train are rebound from themselves here, so re-running
# this cell without re-running the previous one splits the already-reduced set again.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=SEED,
)
X_train, X_valid, y_train, y_valid = split_parts

<a id="2"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>2. Architecture sugar</center></h2>

In [None]:
class XGBoostCLFCreator:
    """Factory for ``xgb.XGBClassifier`` objects that share fixed settings.

    The missing-value marker, tree method and random state are fixed when the
    factory is built; the tunable hyperparameters are supplied on each call.
    """

    def __init__(self, missing:int, tree_method:str, random_state=None):
        # Keyword arguments forwarded unchanged to every classifier created.
        self.__fixed_params = dict(
            missing=missing,
            random_state=random_state,
            tree_method=tree_method,
        )

    def __call__(self, n_estimators, max_depth, learning_rate, subsample, colsample_bytree):
        """Build a new classifier from the given hyperparameters plus the fixed settings."""
        tunable_params = dict(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
        )
        return xgb.XGBClassifier(**tunable_params, **self.__fixed_params)
    
class XGBoostTrialCreator:
    """Builds a classifier whose hyperparameters are suggested by an Optuna trial.

    Each ``*_range`` argument is a ``(low, high)`` pair forwarded to the matching
    ``trial.suggest_*`` call.

    Parameters
    ----------
    n_estimators_range, max_depth_range : tuple
        Bounds passed to ``trial.suggest_int``.
    learning_rate_range, subsample_range, colsample_bytree_range : tuple
        Bounds passed to ``trial.suggest_float``.
    clf_creator : callable, optional
        Called with the five hyperparameters as keyword arguments and expected to
        return an estimator. When omitted (or falsy), defaults to
        ``XGBoostCLFCreator(-999, 'gpu_hist', SEED)``.
    """

    def __init__(self, 
                 n_estimators_range=(100, 1000), 
                 max_depth_range=(3, 17), 
                 learning_rate_range=(1e-3, 1e-1),
                 subsample_range=(1e-1, 99e-2),
                 colsample_bytree_range=(1e-1, 99e-2),
                 clf_creator = None
                ):
        self.__n_estimators_range = n_estimators_range
        self.__max_depth_range = max_depth_range
        self.__learning_rate_range = learning_rate_range
        self.__subsample_range = subsample_range
        self.__colsample_bytree_range = colsample_bytree_range
        # The default factory is only built when no creator is supplied.
        self.__clf_creator = clf_creator or XGBoostCLFCreator(-999, 'gpu_hist', SEED)

    def __call__(self, trial: "optuna.Trial"):
        """Ask ``trial`` for one value per hyperparameter and build the estimator.

        ``suggest_float`` replaces the deprecated ``suggest_uniform``; without
        ``log=True`` it samples from the same linear range.
        """
        return self.__clf_creator(
            n_estimators=trial.suggest_int('n_estimators', *self.__n_estimators_range),
            max_depth=trial.suggest_int('max_depth', *self.__max_depth_range),
            learning_rate=trial.suggest_float('learning_rate', *self.__learning_rate_range),
            subsample=trial.suggest_float('subsample', *self.__subsample_range),
            colsample_bytree=trial.suggest_float('colsample_bytree', *self.__colsample_bytree_range)
        )

    def create_estimator(self, **kwargs):
        """Build an estimator directly from explicit hyperparameters (no trial involved)."""
        return self.__clf_creator(**kwargs)
        
class MetricCreator:
    """Optuna objective: fit a trial-configured classifier and score it on validation data.

    Parameters
    ----------
    X_train, y_train
        Data passed to the classifier's ``fit``.
    X_valid, y_valid
        Data used to compute the objective value.
    base_metric : callable, optional
        Called as ``base_metric(y_valid, y_pred)``; defaults to ``roc_auc_score``.
    clf_trial_creator : callable, optional
        Called with the trial and expected to return an estimator; must also
        provide ``create_estimator(**kwargs)``. Defaults to ``XGBoostTrialCreator()``.
    """

    def __init__(self, 
                 X_train, 
                 X_valid, 
                 y_train, 
                 y_valid, 
                 base_metric = None,
                 clf_trial_creator = None
                ):
        self.__base_metric = base_metric or roc_auc_score
        self.__X_train = X_train
        self.__X_valid = X_valid
        self.__y_train = y_train
        self.__y_valid = y_valid
        # Honour the injected creator; previously the argument was silently ignored
        # and a default XGBoostTrialCreator was always built.
        self.__clf_trial_creator = clf_trial_creator or XGBoostTrialCreator()

    def __call__(self, trial: "optuna.Trial"):
        """Train the classifier suggested for ``trial`` and return its validation score."""
        clf = self.__clf_trial_creator(trial)
        clf.fit(self.__X_train, self.__y_train)
        # NOTE(review): the metric receives the output of `predict`; with the default
        # roc_auc_score, scoring `predict_proba` output may be what is intended — confirm.
        y_pred = clf.predict(self.__X_valid)
        return self.__base_metric(self.__y_valid, y_pred)

    def create_estimator(self, **kwargs):
        """Build an estimator from explicit hyperparameters via the trial creator."""
        return self.__clf_trial_creator.create_estimator(**kwargs)

<a id="3"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>3. XGBClassifier parameter optimization</center></h2>

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable;
# TPESampler is requested explicitly so that it can receive the seed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
# Objective: fit on the training split, score on the validation split.
metric = MetricCreator(X_train, X_valid, y_train, y_valid)
study.optimize(metric, n_trials=20)

<a id="4"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>4. Optuna visualization</center></h2>

In [None]:
# Optimization history of the study.
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [None]:
# Hyperparameter importances for the study.
fig = optuna.visualization.plot_param_importances(study)
fig.show()

In [None]:
# EDF plot for the study (the plotting function takes a list of studies).
fig = optuna.visualization.plot_edf([study])
fig.show()

<a id="5"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>5. Predict</center></h2>

In [None]:
# Display the study's best hyperparameters; they are passed to create_estimator in the next cell.
study.best_params

In [None]:
# Rebuild the classifier from the best hyperparameters found by the study and
# fit it on the training split.
best_params = study.best_params
clf = metric.create_estimator(**best_params)
clf.fit(X_train, y_train)

In [None]:
# NOTE(review): `janestreet` is presumably only importable inside the competition
# environment, which is why the import stays in this cell — confirm.
import janestreet
env = janestreet.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
    # fillna returns a new frame; the result must be assigned, otherwise the test
    # features are never imputed with the -999 sentinel used for the training data.
    X_test = X_test.fillna(-999)
    y_preds = clf.predict(X_test)
    # Bracket assignment always writes the 'action' column (attribute assignment
    # only does so when the column already exists).
    sample_prediction_df['action'] = y_preds
    env.predict(sample_prediction_df)