In [None]:
# Notebook setup: imports, display/warning configuration and creation of the
# competition environment. Statement order is kept as-is because the warning
# filter below is installed before the later imports run.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
import os
import warnings
from functools import partial
# Silences every warning raised from here on, including any emitted while the
# imports further down are executed.
warnings.filterwarnings("ignore")

import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# NOTE(review): `pandas` and `numpy` are imported a second time below; the
# repeats are harmless but redundant.
# NOTE(review): `optimize` (defined later) references `lgb` and
# `roc_auc_score`, neither of which is imported in this cell.
import pandas as pd
from sklearn import preprocessing
import gc
import catboost as cat
import numpy as np

In [None]:
%time
train = pd.read_pickle('/kaggle/input/janestreet/jane_street_train.pkl.gzip')
print ("Data is loaded!")

print('train shape is {}'.format(train.shape))

train = train[train['weight'] != 0]
train['action'] = (train['resp'] > 0).astype('int')

X_train = train.loc[:, train.columns.str.contains('feature')]
y_train = train.loc[:, 'action']
groups = train['date'].values
weightes = train['weight'].values
resps = train['resp'].values

del train
gc.collect()

Loading takes only a few seconds because the original dataset was saved to a pickle file beforehand.

In [None]:
def group_time_split(groups: np.ndarray, splits, X):
    """Yield expanding-window (train, validation) index pairs split by group.

    The distinct values of ``groups`` are sorted and cut at ``splits - 1``
    boundaries. For each boundary, rows whose group sorts before it become the
    training indices and every remaining row becomes the validation indices,
    so the training window grows and the validation window shrinks from fold
    to fold.

    Parameters
    ----------
    groups : np.ndarray
        1-D array holding one group label per row.
    splits : int
        Number of equally sized group chunks; ``splits - 1`` folds are yielded.
    X : sized
        Only ``len(X)`` is used; it must equal the number of group labels.

    Yields
    ------
    tuple of np.ndarray
        ``(train_indices, validation_indices)`` as positional row indices.

    Raises
    ------
    ValueError
        If ``splits`` is not positive, if there are fewer distinct groups than
        ``splits``, or if ``X`` and ``groups`` differ in length.
    """
    if splits < 1:
        raise ValueError("splits must be a positive integer, got {!r}".format(splits))

    # One pass: `group_pos[k]` is the rank of row k's group among the sorted
    # unique groups, so each fold reduces to a single integer comparison
    # instead of two `np.isin` scans over all rows.
    group_list, group_pos = np.unique(groups, return_inverse=True)
    group_pos = group_pos.reshape(-1)
    n_groups = len(group_list)
    if n_groups < splits:
        raise ValueError(
            "cannot build {} splits from only {} distinct groups".format(splits, n_groups)
        )

    n_samples = len(X)
    if n_samples != len(group_pos):
        raise ValueError(
            "X has {} rows but groups has {} entries".format(n_samples, len(group_pos))
        )

    indices = np.arange(n_samples)
    test_size = n_groups // splits
    # The first training window also absorbs the remainder groups so that the
    # later boundaries stay exactly `test_size` groups apart.
    first_start = test_size + n_groups % splits
    for test_start in range(first_start, n_groups, test_size):
        in_train = group_pos < test_start
        yield indices[in_train], indices[~in_train]


def utility_score_last(date, weight, resp, action):
    """Compute the utility score of a set of trading decisions.

    Daily profit is ``Pi[d] = sum(weight * resp * action)`` over the rows of
    day ``d``. With ``t = sum(Pi) / sqrt(sum(Pi ** 2)) * sqrt(250 / n_days)``
    the score is ``clip(t, 0, 6) * sum(Pi)``.

    Parameters
    ----------
    date : np.ndarray
        Non-negative integer day id per row (used as ``np.bincount`` bins).
    weight, resp, action : np.ndarray
        Per-row weight, response and 0/1 decision, same length as ``date``.

    Returns
    -------
    float
        The utility score; ``0.0`` when no day has a non-zero profit (for
        example when every action is 0), where the ratio would be 0/0.
    """
    Pi = np.bincount(date, weight * resp * action)
    # Count the days actually present. `date[-1] + 1` is only correct when the
    # dates are sorted and start at 0; on a validation slice that starts later
    # it overstates the day count and shrinks `t`.
    count_i = len(np.unique(date))
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if count_i == 0 or norm == 0:
        return 0.0
    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u


def get_opt_threshold(pred, y_true, groups, weights, bins=50):
    """Grid-search the cut-off on ``pred`` that maximizes the utility score.

    ``bins`` candidate thresholds are spaced evenly over ``[pred.min(),
    pred.max())``. For each candidate the action is 1 where ``pred`` exceeds
    it, and the result is scored with ``utility_score_last`` (``y_true`` is
    passed as the response argument). Every new best utility is printed.

    Returns the best threshold, or ``pred.min()`` when no candidate reaches a
    utility above 0.
    """
    lowest, highest = pred.min(), pred.max()
    best_utility, best_threshold = 0, lowest
    for candidate in np.linspace(lowest, highest, num=bins, endpoint=False):
        actions = np.where(pred > candidate, 1, 0)
        utility = utility_score_last(groups, weights, y_true, actions)
        # Skip anything that does not strictly beat the best so far
        # (this also skips NaN utilities).
        if not utility > best_utility:
            continue
        print(utility)
        best_utility, best_threshold = utility, candidate
    return best_threshold


def optimize(params, X_train: pd.DataFrame, y_train:pd.DataFrame, groups: np.ndarray, splits=5):
    """Score one LightGBM parameter set on time-ordered group folds.

    Prints ``params``, fits an ``lgb.LGBMClassifier`` on every fold produced
    by ``group_time_split`` (early stopping after 20 rounds on the validation
    AUC) and combines the validation AUCs into a weighted mean in which the
    fold at 0-based position ``i`` has weight ``i + 1``, so later folds count
    more.

    NOTE(review): ``lgb`` and ``roc_auc_score`` are not imported in the
    visible part of this notebook; both must be in scope before calling this.

    Parameters
    ----------
    params : mapping
        Must provide ``learning_rate``, ``max_depth``, ``reg_alpha``,
        ``subsample`` and ``colsample_bytree``.
    X_train, y_train
        Features and labels, indexed positionally with ``.iloc``.
    groups : np.ndarray
        Group label per row, forwarded to ``group_time_split``.
    splits : int
        Forwarded to ``group_time_split``; also fixes the weight normalizer.

    Returns
    -------
    float
        The negated weighted AUC (presumably so that a minimizing search can
        be used; confirm with the caller).
    """
    print(params)
    depth = params['max_depth']
    lgb_params = {
        'learning_rate': params['learning_rate'],
        'max_depth': depth,
        'reg_alpha': params['reg_alpha'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'boosting_type': 'gbdt',
        'random_state': 2020,
        'subsample_freq': 1,
        'num_leaves': 2 ** depth - 1,
        'metric': 'None',
    }

    # 1 + 2 + ... + (splits - 1): the sum of the per-fold weights.
    weight_total = splits * (splits - 1) / 2
    score = 0

    folds = group_time_split(groups, splits, X_train)
    for fold_rank, (fit_rows, eval_rows) in enumerate(folds, start=1):
        fit_X, fit_y = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        eval_X, eval_y = X_train.iloc[eval_rows], y_train.iloc[eval_rows]

        booster = lgb.LGBMClassifier(**lgb_params)
        booster.fit(fit_X, fit_y, eval_set=[(eval_X, eval_y)], eval_metric='auc',
                    early_stopping_rounds=20, verbose=False)

        eval_proba = booster.predict_proba(eval_X)[:, 1]
        score += fold_rank * roc_auc_score(eval_y, eval_proba) / weight_total

        gc.collect()

    return - score

In [None]:
# Train CatBoost on the time-ordered folds and record, for each trained fold,
# the fitted model and the probability threshold that maximizes utility on
# that fold's validation rows.
splits = 4

base_params = {'colsample_bylevel': 0.601104234749781, 'l2_leaf_reg': 6.201131333667228, 
               'learning_rate': 0.033455321575393125, 'max_depth': 10.0, 'subsample': 0.8051610581570489,
              'random_state': 2020, 'eval_metric': 'AUC'}


# avg_best_iterations = 0
models = []
opt_thresholds = []


for i, (tr_idx, val_idx) in enumerate(group_time_split(groups, splits, X_train)):
    print("#"*50)
    print("fold:", i)
    # Folds 0 and 1 are skipped; only later folds (with the longest training
    # history) are trained.
    if i < 2:
        continue
    tr_X = X_train.iloc[tr_idx]
    tr_y = y_train.iloc[tr_idx]
#     tr_group = groups[tr_idx]
#     tr_weight = weightes[tr_idx]
#     tr_resp = resps[tr_idx]
    
    val_X = X_train.iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    val_group = groups[val_idx]
    val_weight = weightes[val_idx]
    val_resp = resps[val_idx]
    
    Tr = cat.Pool(tr_X, tr_y)
    val = cat.Pool(val_X, val_y)
    
    clf = cat.CatBoostClassifier(**base_params)
    clf.fit(Tr, eval_set=val,
            early_stopping_rounds=20, verbose=False)
    models.append(clf)

    # Score the validation rows once (this used to be computed twice per fold).
    val_pred = clf.predict_proba(val_X)[:, 1]
    
    opt_th = get_opt_threshold(val_pred, val_resp, val_group, val_weight)
    opt_thresholds.append(opt_th)
    
    gc.collect()

# Drop the per-fold data, including the CatBoost pools, so the memory is free
# for whatever runs next.
del tr_X, tr_y, val_X, val_y, val_pred, val_group, val_weight, val_resp, Tr, val
gc.collect()

In [None]:
# Refit on every training row with a fixed tree budget: the best iteration of
# the last fold model plus a margin of 5. No validation set is used here.
base_params.update(n_estimators=models[-1].best_iteration_ + 5)
clf = cat.CatBoostClassifier(**base_params)
clf.fit(X_train, y_train)

# The training frames are not needed once the final model is fitted.
del X_train
del y_train
gc.collect()

In [None]:
# Decision threshold applied to the blended probability in the submission loop.
# The two commented-out lines are alternatives that derive it from the per-fold
# optima in `opt_thresholds`; the fixed value 0.5 is what is actually used.
# weighted_opt = (np.arange(1,5) * opt_thresholds).sum() / 10
# weighted_opt = np.min(opt_thresholds)
weighted_opt = 0.5
# Only the most recently trained fold model is kept for blending. The original
# note said "the last two because of the limited time", but a single model
# (`models[-1]`) is selected here.
model = models[-1]

In [None]:
# Submission loop: the environment hands over one test frame at a time together
# with the prediction frame that has to be filled in and sent back.
for test_df, sample_prediction_df in iter_test:
    has_weight = test_df['weight'].item() > 0
    if not has_weight:
        # Non-positive weight: no model inference, the action is fixed to 0.
        sample_prediction_df.action = 0
    else:
        X_test = test_df.loc[:, test_df.columns.str.contains('feature')]

        # Blend: the full-data model counts twice, the last fold model once.
        full_fit_proba = clf.predict_proba(X_test)[:, 1]
        last_fold_proba = model.predict_proba(X_test)[:, 1]
        y_preds = (2 * full_fit_proba + last_fold_proba) / 3
        y_preds = np.where(y_preds > weighted_opt, 1, 0)
        sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)