In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from xgboost import XGBClassifier
import xgboost as xgb

import joblib

In [None]:
SEED = 1111

# Seed NumPy's global RNG so NumPy-based randomness is repeatable.
np.random.seed(SEED)

import datatable

# Read the training CSV with datatable, then convert to a pandas DataFrame.
datatable_frame = datatable.fread('../input/jane-street-market-prediction/train.csv')

# Keep only rows after day 85 that carry a non-zero weight. Both conditions
# are applied in one pass and the index is reset afterwards, so the result is
# a fresh frame with a contiguous 0..n-1 index. Assigning a column on a
# boolean-filtered slice (as before) triggers pandas' SettingWithCopyWarning.
df_raw = (
    datatable_frame.to_pandas()
    .query('date > 85 and weight != 0')
    .reset_index(drop=True)
)

# Binary target: 1 when the realised response `resp` is positive, else 0.
df_raw['action'] = (df_raw['resp'].values > 0).astype(int)

In [None]:
# When True, 20% of the rows are held out as a local test set (df_test);
# when False, every row is used for training.
LOCAL_TRAIN_TEST_SPLIT = False
# When True, the trained models are scored on the local hold-out set;
# when False, the competition submission loop is run instead.
LOCAL_TEST = False

# Local scoring reads the hold-out set, which only exists when the split is
# enabled. Fail here rather than with a NameError after the models have been
# trained.
if LOCAL_TEST and not LOCAL_TRAIN_TEST_SPLIT:
    raise ValueError(
        "LOCAL_TEST=True requires LOCAL_TRAIN_TEST_SPLIT=True: "
        "without the split there is no hold-out set to evaluate on."
    )

In [None]:
# Default: train on every available row (df_train is the same object as df_raw).
df_train = df_raw
if LOCAL_TRAIN_TEST_SPLIT:
    # Optional shuffled 80/20 hold-out split with a fixed random state.
    df_train, df_test = train_test_split(
        df_raw,
        test_size=0.2,
        shuffle=True,
        random_state=150,
    )

In [None]:
# Model inputs are all columns whose name contains "feature".
features = df_train.columns[df_train.columns.str.contains("feature")].tolist()

# Per-feature median over the training rows, used as the fill value for NaNs.
f_median = df_train.loc[:, features].median()
neutral_values = f_median

# In-place fill: only columns present in `neutral_values` (the features) are
# touched. Note that df_train may be the same object as df_raw when no local
# split is made, in which case df_raw is modified as well.
df_train.fillna(value=neutral_values, inplace=True)

In [None]:
# Write the per-feature training medians to a CSV in the working directory
# (presumably so the same fill values can be reused outside this notebook).
f_median.to_csv('median_pd_130_features.csv')

In [None]:
# The five response columns; one binary classifier is trained per column.
resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

# Feature matrix: every column whose name contains "feature".
X_train = df_train.filter(like='feature')

# Target matrix of shape (n_rows, len(resp_cols)): 1 where the response is
# positive, 0 otherwise.
y_train = (df_train[resp_cols] > 0).astype('int').to_numpy()

# Single-column target derived from `resp` only.
y_action_train = df_train['action'].values

In [None]:
if LOCAL_TRAIN_TEST_SPLIT:
    # Fill hold-out NaNs with the medians computed on the training rows.
    df_test.fillna(value=neutral_values, inplace=True)

    # Same feature selection and target construction as for the training set.
    X_test = df_test.filter(like='feature')

    y_test = (df_test[resp_cols] > 0).astype('int').to_numpy()

    y_action_test = df_test['action'].values

XGBoost

In [None]:
# Hyperparameters shared by every per-response classifier.
xgb_params = dict(
    n_estimators=500,
    max_depth=8,
    subsample=0.9,
    learning_rate=0.05,
    objective='binary:logistic',
    tree_method='gpu_hist',
)

# Fit one binary classifier per response column; xgb_clfs[i] predicts the
# sign of resp_cols[i].
xgb_clfs = []
for i in range(len(resp_cols)):
    print('classifier', i, 'is training')
    xgb_clf = xgb.XGBClassifier(**xgb_params)
    xgb_clf.fit(X_train, y_train[:, i])
    xgb_clfs.append(xgb_clf)

In [None]:
# Persist each fitted classifier; the file name carries the classifier index
# followed by the hyperparameter tag.
for i in range(len(resp_cols)):
    model_path = f"xgb{i}-n-500-d-8-sub-0.9-lr-0.05.joblib"
    joblib.dump(xgb_clfs[i], model_path)

In [None]:
if LOCAL_TEST:
    # Column i holds classifier i's predicted probability of class 1
    # (predict_proba returns one column per class; column 1 is class 1).
    five_preds = np.array(
        [xgb_clfs[i].predict_proba(X_test)[:, 1] for i in range(len(resp_cols))]
    ).T

    th = 0.5

    # Combine the five probabilities per row with the median, then threshold.
    f_get_action = np.median
    preds = f_get_action(five_preds, axis=1)
    actions_predicted = (preds >= th).astype(int)

    print(preds.shape)
    print(actions_predicted.shape)

    # Accuracy against the `resp`-based action label of the hold-out rows.
    print(metrics.accuracy_score(y_action_test, actions_predicted))

In [None]:
if not LOCAL_TEST:
    # Competition-only dependencies, imported here so the rest of the
    # notebook can run where `janestreet` is not installed.
    import janestreet
    from tqdm import tqdm

    # How the per-classifier probabilities are combined, and the decision threshold.
    f_get_action = np.median
    th = 0.5

    models = xgb_clfs

    # Loop-invariant: per-feature fill values in the same order as `features`.
    fill_values = neutral_values.values

    env = janestreet.make_env()
    for (test_df, pred_df) in tqdm(env.iter_test()):
        if test_df['weight'].item() > 0:
            x_tt = test_df.loc[:, features].values
            # NOTE(review): the models were fit on a DataFrame but are fed a
            # bare NumPy array here; confirm the installed XGBoost version
            # accepts this without a feature-name mismatch.

            # Replace missing values with the training medians. np.where is
            # used instead of `nan_to_num(x) + isnan(x) * median` because the
            # latter turns every entry of a column into NaN whenever that
            # column's median is itself NaN (False * nan == nan).
            nan_mask = np.isnan(x_tt)
            if nan_mask.any():
                x_tt = np.where(nan_mask, fill_values, x_tt)

            # Column i holds classifier i's predicted probability of class 1.
            five_preds = np.array(
                [clf.predict_proba(x_tt)[:, 1] for clf in models]
            ).T

            # Median of the five probabilities, thresholded to a 0/1 action.
            preds = f_get_action(five_preds, axis=1)
            actions_predicted = np.where(preds >= th, 1, 0).astype(int)
            pred_df.action = actions_predicted
        else:
            # Rows with non-positive weight are not traded; skip inference.
            pred_df.action = 0
        env.predict(pred_df)