<center><h2>Jane Street Market Prediction | LGB with GroupKFold | katsu1110 </h2></center><hr>

![](https://optuna.org/assets/img/optuna-logo@2x.png)

This is another attempt of mine to get a good LGB model. I start over with a simple model ;)

As a bonus, I save the tuned model in the [Treelite](https://treelite.readthedocs.io/en/latest/) format to accelerate the inference speed.

This notebook loads the data in feather format from [another notebook of mine](https://www.kaggle.com/code1110/janestreet-save-as-feather?scriptVersionId=47635784), so that we don't have to wait a long time for the csv files to load.

In this notebook we treat the task as a binary classification problem.

# Install Treelite

In [None]:
!pip --quiet install ../input/treelite/treelite-0.93-py3-none-manylinux2010_x86_64.whl

In [None]:
!pip --quiet install ../input/treelite/treelite_runtime-0.93-py3-none-manylinux2010_x86_64.whl

In [None]:
import numpy as np
import pandas as pd

import os, sys
import gc
import math
import random
import pathlib
from tqdm import tqdm
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn import metrics
import operator
import xgboost as xgb
import lightgbm as lgb
import optuna
from tqdm import tqdm_notebook as tqdm

# treelite
import treelite
import treelite_runtime 

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
# Plot styling: seaborn "talk" plotting context combined with the
# matplotlib 'fivethirtyeight' style sheet.
sns.set_context("talk")
style.use('fivethirtyeight')
# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation / pandas warnings that may point
# at real problems.
warnings.filterwarnings('ignore')

# Config
Some configuration setups.

In [None]:
# Seed handed to the cross-validation splitter as its random_state.
SEED = 20201225 # Merry Christmas!
# Number of cross-validation folds; one model is trained per fold.
NFOLD = 4
# Directory the feather files are read from. The original csv directory is
# kept on the next line for reference only.
# INPUT_DIR = '../input/jane-street-market-prediction/'
INPUT_DIR = '../input/janestreet-save-as-feather/'
# At submission time action = 1 only if the averaged prediction exceeds this value.
TRADING_THRESHOLD = 0.502 # 0 ~ 1: The smaller, the more aggressive
# Training rows with date < DATE_BEGIN are dropped during preprocessing.
DATE_BEGIN = 0 # 0 ~ 499: set 0 for model training using the complete data 

# Load data
I have already saved the training data in feather format in [another notebook of mine](https://www.kaggle.com/code1110/janestreet-save-as-feather?scriptVersionId=47635784). Loading csv takes time, but loading feather is really fast :)

In [None]:
# Show which files are available in the input directory (displayed as the cell output).
os.listdir(INPUT_DIR)

In [None]:
%%time

# load data blitz fast!
def load_data(input_dir=INPUT_DIR):
    """Load the competition tables that were saved in feather format.

    Parameters
    ----------
    input_dir : str or os.PathLike
        Directory containing ``train.feather``, ``features.feather``,
        ``example_test.feather`` and ``example_sample_submission.feather``.
        A trailing path separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, features, example_test, ss)``, in that order.
    """
    # Join path components with pathlib instead of concatenating strings:
    # 'dir' + 'train.feather' silently produces 'dirtrain.feather' when the
    # trailing slash is missing, and fails outright for Path objects.
    base_dir = pathlib.Path(input_dir)
    train = pd.read_feather(base_dir / 'train.feather')
    features = pd.read_feather(base_dir / 'features.feather')
    example_test = pd.read_feather(base_dir / 'example_test.feather')
    ss = pd.read_feather(base_dir / 'example_sample_submission.feather')
    return train, features, example_test, ss

# Read all four tables; only `train` is used for model fitting in this notebook.
train, features, example_test, ss = load_data(INPUT_DIR)

In [None]:
# Drop the tables that are not needed for training, then ask the garbage
# collector to reclaim their memory right away.
del features
del example_test
del ss
gc.collect()

# Preprocess

In [None]:
# Remember the raw row count so the size reduction can be reported below.
original_size = train.shape[0]

# Remove weight = 0 rows to save memory, then keep only the data from
# DATE_BEGIN onwards.
train = train.query('weight > 0')
train = train.query(f'date >= {DATE_BEGIN}')

# Reset the index once, after *both* filters. Resetting after the weight
# filter only (as before) left a non-contiguous index whenever
# DATE_BEGIN > 0, so label-based and positional row access would disagree.
train = train.reset_index(drop=True)

print('Train size reduced from {:,} to {:,}.'.format(original_size, train.shape[0]))

In [None]:
# Binary target: 1 where the weighted response (resp * weight) is strictly
# positive, 0 otherwise.
train['action'] = 1 * ((train['resp'] * train['weight']) > 0)

In [None]:
# Model inputs: every column whose name starts with 'feature'.
feats = [col for col in train.columns if col.startswith('feature')]
print('There are {:,} features.'.format(len(feats)))

# Fit with GroupKFold
I use GroupKFold with shuffle = True.

In [None]:
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    GroupKFold with random shuffle with a sklearn-like structure.

    The unique values of the group column are split with
    ``sklearn.model_selection.KFold``; each row then follows its group, so a
    group never appears in both the training and the validation part of a
    fold.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    shuffle : bool
        Whether the unique group values are shuffled before being split.
    random_state : int or None
        Seed for the shuffle. Ignored when ``shuffle`` is False.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y, group):
        """Yield ``(train_idx, val_idx)`` positional index arrays for each fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the group column.
        y : object
            Unused; present to mirror the sklearn splitter signature.
        group : str
            Name of the column in ``X`` that holds the group ids.

        Yields
        ------
        tuple of numpy.ndarray
            Row positions of the training rows and of the validation rows.
        """
        # KFold rejects a random_state when shuffle is off (recent
        # scikit-learn versions raise ValueError), so only forward the seed
        # when it is actually used.
        seed = self.random_state if self.shuffle else None
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)
        group_values = X[group]
        unique_ids = group_values.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # split the groups, then map the groups back to row positions
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(group_values.isin(tr_group))[0]
            val_idx = np.where(group_values.isin(va_group))[0]
            yield train_idx, val_idx

# ---- StratifiedGroupKFold ----
class StratifiedGroupKFold(object):
    """
    StratifiedGroupKFold with random shuffle with a sklearn-like structure.

    Groups are assigned to folds greedily: each group goes to the fold that
    keeps the per-label share of samples most even across folds (lowest mean
    standard deviation of the label fractions). A group is never split
    between folds.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    shuffle : bool
        Whether the groups are shuffled before the greedy assignment. With
        ``shuffle=False`` the groups are processed in order of first
        appearance and ``random_state`` has no effect.
    random_state : int or None
        Seed for the shuffle.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y, group):
        """Yield ``(train_idx, test_idx)`` lists of row positions per fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the group column.
        y : iterable of int
            Non-negative integer class labels, aligned with the rows of ``X``.
        group : str
            Name of the column in ``X`` that holds the group ids.

        Yields
        ------
        tuple of list of int
            Row positions of the training rows and of the test rows.
        """
        labels_num = np.max(y) + 1
        # per-group label histogram and overall label totals
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        groups = X[group].values
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)

        def eval_y_counts_per_fold(y_counts, fold):
            """Score the label balance if ``y_counts`` were added to ``fold`` (lower is better)."""
            # tentatively add the group, measure, then undo
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                # A label id below max(y) that never occurs has a total of 0.
                # Dividing by it would give NaN, which makes every comparison
                # below False and sends all groups to fold 0.
                label_total = y_distr[label] or 1
                label_std = np.std([y_counts_per_fold[i][label] / label_total for i in range(self.n_splits)])
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)

        groups_and_y_counts = list(y_counts_per_group.items())
        # Honour the `shuffle` flag (it used to be ignored and the groups
        # were always shuffled).
        if self.shuffle:
            random.Random(self.random_state).shuffle(groups_and_y_counts)

        # Place the most label-imbalanced groups first; the sort is stable, so
        # ties keep the (shuffled) order.
        for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
            best_fold = None
            min_eval = None
            for i in range(self.n_splits):
                fold_eval = eval_y_counts_per_fold(y_counts, i)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = i
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for fold in range(self.n_splits):
            test_groups = groups_per_fold[fold]
            train_groups = all_groups - test_groups

            train_idx = [row for row, g in enumerate(groups) if g in train_groups]
            test_idx = [row for row, g in enumerate(groups) if g in test_groups]

            yield train_idx, test_idx

In [None]:
# from https://www.kaggle.com/gogo827jz/jane-street-super-fast-utility-score-function/notebook
from numba import njit

@njit(fastmath = True)
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score used to evaluate a set of trading actions.

    Parameters
    ----------
    date : array of non-negative int
        Day index of every row (used as the bin index).
    weight, resp, action : array
        Per-row weight, response and chosen action (0 or 1), aligned with
        ``date``.

    Returns
    -------
    float
        ``min(max(t, 0), 6) * sum(Pi)``, where ``Pi`` is the per-date sum of
        ``weight * resp * action`` and
        ``t = sum(Pi) / sqrt(sum(Pi ** 2)) * sqrt(250 / len(Pi))``.
        Returns 0.0 when no date has a non-zero contribution.
    """
    Pi = np.bincount(date, weight * resp * action)
    # NOTE(review): len(Pi) is max(date) + 1, so dates that are absent from
    # the input still count towards the 250 / len(Pi) factor.
    sum_sq = np.sum(Pi ** 2)
    if sum_sq == 0:
        # No trade taken (or empty input): t would be 0/0, which is NaN and
        # undefined under fastmath. The utility of not trading is 0.
        return 0.0
    t = np.sum(Pi) / np.sqrt(sum_sq) * np.sqrt(250 / len(Pi))
    u = min(max(t, 0), 6) * np.sum(Pi)
    return u

In [None]:
# LightGBM hyper-parameters for the binary classifier.
# NOTE(review): with max_depth 7 a tree can have at most 2**7 = 128 leaves,
# so num_leaves = 256 is presumably never reached - confirm this is intended.
params = {
    'num_leaves': 256,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 7,
    'min_child_weight': 4,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 4,
    'min_child_samples': 16,
    'lambda_l1': 1,
    'lambda_l2': 1,
}

# Probability cut-off used to turn out-of-fold predictions into actions.
# NOTE(review): this differs from TRADING_THRESHOLD, which is used at submission time.
th = 0.5
# Split by date so that all rows of one day fall into the same fold.
cv = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=SEED)
group = 'date'
target = 'action'
# Out-of-fold predictions, one slot per training row, filled fold by fold.
oof = np.zeros(train.shape[0])
# One fitted booster per fold.
models = []
for fold, (train_idx, val_idx) in tqdm(enumerate(cv.split(train, train[target], group))):
    # train test split (train_idx / val_idx are row positions, hence iloc)
    x_train, x_val = train[feats].iloc[train_idx], train[feats].iloc[val_idx]
    y_train, y_val = train[target].iloc[train_idx], train[target].iloc[val_idx]

    # model fitting
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_val, y_val)
    
    # No num_boost_round is passed, so the library default applies; training
    # is set to stop early after 40 rounds without improvement.
    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], 
                      early_stopping_rounds=40, verbose_eval=1000)
    oof[val_idx] = model.predict(x_val)
    models.append(model)
    
    # save model (text format; read back by the Treelite cell)
    model.save_model(f'model_fold{fold}.txt')
    
    # score: utility of this fold's validation rows at threshold `th`
    date = train['date'].iloc[val_idx].values
    weight = train['weight'].iloc[val_idx].values
    resp = train['resp'].iloc[val_idx].values
    action = 1 * (oof[val_idx] > th)
    score = utility_score_numba(date, weight, resp, action)
    print(f'FOLD {fold}: score = {score}')

# CV score

In [None]:
# save the out-of-fold predictions to disk (np.save appends the '.npy' extension)
np.save('oof', oof)

In [None]:
# Utility over all training rows, using the out-of-fold predictions
# thresholded at `th`.
date, weight, resp = (train[col].values for col in ('date', 'weight', 'resp'))
action = 1 * (oof > th)
score = utility_score_numba(date, weight, resp, action)
print('CV score = {}'.format(score))

# Feature importance
Let's see feature importance given by the model.

In [None]:
# Gain-based feature importance. `model` is whatever the name is bound to when
# this cell runs: after the training loop that is the last fold's booster only.
lgb.plot_importance(model, importance_type="gain", figsize=(7, 40))

# Treelite
I believe Treelite is a must in this competition, to avoid submission errors caused by long inference times.

In [None]:
# One Treelite predictor per fold, built from the model files saved during training.
predictors = []
for fold in range(len(models)):
    # load LGB with Treelite
    # NOTE: this rebinds the global name `model` (previously the LightGBM booster).
    model = treelite.Model.load(f'model_fold{fold}.txt', model_format='lightgbm')
    
    # generate shared library (compiled with gcc)
    # NOTE(review): 'parallel_comp': 32 presumably splits the generated code
    # into 32 units for parallel compilation - confirm against the Treelite docs.
    toolchain = 'gcc'
    model.export_lib(toolchain=toolchain, libpath=f'./mymodel{fold}.so',
                     params={'parallel_comp': 32}, verbose=True)

    # predictor from treelite: load the compiled library for inference
    predictor = treelite_runtime.Predictor(f'./mymodel{fold}.so', verbose=True)
    predictors.append(predictor)

# Submit
Let's use Treelite for faster inference.

In [None]:
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set
    
for (test_df, pred_df) in tqdm(iter_test):
    # .item() requires exactly one value, i.e. test_df is expected to hold a single row.
    if test_df['weight'].item() > 0:
        # inference with treelite: average the predictions of all fold models
        batch = treelite_runtime.Batch.from_npy2d(test_df[feats].values)
        pred = np.mean([predictor.predict(batch) for predictor in predictors], axis=0)
        # trade only when the averaged prediction exceeds the threshold
        pred_df.action = (pred > TRADING_THRESHOLD).astype('int')
    else:
        # weight 0: no action is taken and the models are not run for this row
        pred_df.action = 0
    # hand the prediction back to the environment before the next iteration
    env.predict(pred_df)

All done!