# Kaggle: Jane Street Market Prediction
https://www.kaggle.com/c/jane-street-market-prediction/

# Keras autoencoder + CatBoost v05_1

In [None]:
#!pip install optuna

In [None]:
import gc
import numpy as np 
import pandas as pd
from tqdm.notebook import tqdm
import joblib
pd.set_option('display.max_columns', 200)

from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import Lambda, Concatenate, Input, GaussianNoise
from tensorflow.keras import utils, initializers, optimizers, regularizers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.regularizers import L1L2
import tensorflow.keras.backend as K
import tensorflow as tf

from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss
#from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

#import optuna
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import catboost
#import lightgbm
import sklearn

# Report the versions of the main libraries used by this notebook
for lib_label, lib_version in (
    ('Scikit-learn ver:', sklearn.__version__),
    ('Keras ver:', tf.keras.__version__),
    ('CatBoost ver:', catboost.__version__),
):
    print(lib_label, lib_version)
#print('LightGBM ver:', lightgbm.__version__)
#print('Optuna ver:', optuna.__version__)

In [None]:
# Constants: run-mode switches, column groups and paths used by the cells below
LOCAL_MODE = False # local notebook: read train.parquet and load weights/models from the working dir; else use Kaggle inputs and PATH
IS_TRAIN_ENCODER = False # True: fit the autoencoder and save encoder weights; False: load saved encoder weights
IS_GRID = False # True: run the CatBoost grid search for the best parameters
IS_TRAIN = False # True: train and save the CatBoost model; False: load the saved model and only predict
IS_SUBMIT = True # True: run the submission loop
IS_ADD_FEATURES = False # True: add tag-sum features built from features.csv
TEST_SIZE = 0.1 # fraction of rows used as the test split
SIZE_TRAIN = None # e.g. 400_000 keeps only the last SIZE_TRAIN rows; None keeps all rows
COL_FEATURES = [f'feature_{x}' for x in range(130)]
COL_TARGET = ['target']
COL_SCORE = ['date', 'weight', 'resp']
NAN_VALUE = 'mean' # 'mean': fill NaNs with the column mean; other value: fillna with it; None: drop rows with NaNs
PATH = '../input/jsm-v05-1/' # directory with saved weights/model when not in LOCAL_MODE
gc.collect()

In [None]:
%%time
# Load the training data: try cuDF (GPU CSV reader) first, fall back to pandas.
try:
    import cudf
    train_cudf  = cudf.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
    train = train_cudf.to_pandas()
    del train_cudf
    gc.collect()
except Exception as err:
    # cuDF is optional; report why it was skipped instead of hiding the error
    print('cuDF load failed ({}: {}), falling back to pandas'.format(type(err).__name__, err))
    if LOCAL_MODE: # for local notebook
        train = pd.read_parquet('kaggle/input/jane-street-market-prediction/train.parquet')
    else:
        train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
if SIZE_TRAIN:
    train = train[-SIZE_TRAIN:]
# drop day<86, in forum say JSM changed the strategy after 86 days
train = train.query('date > 85').reset_index(drop=True)
gc.collect()
train['target'] = (train['resp'] > 0).astype('int8') # set target field
# only weight>0: mark other rows as NaN so the dropna() below removes them
train.loc[train['weight'] <= 0, 'weight'] = np.nan
# downcast to 32-bit to reduce memory use
train = train.astype({x: np.float32 for x in train.select_dtypes(include='float64').columns})
train = train.astype({x: np.int32 for x in train.select_dtypes(include='int64').columns})
train = train[COL_FEATURES + COL_TARGET + COL_SCORE] # trim the extra columns
# `is not None` (not truthiness) so that a fill value of 0 is honoured
if NAN_VALUE is not None:
    if NAN_VALUE == 'mean':
        # F_MEAN is kept at module level: the submission loop reuses it
        F_MEAN = train[COL_FEATURES].mean()
        # column by column to avoid copying the whole feature matrix at once
        for col, val_mean in F_MEAN.items():
            train[col] = train[col].fillna(val_mean)
    else:
        # fill the FEATURE columns (the original wrote the result into COL_TARGET)
        train[COL_FEATURES] = train[COL_FEATURES].fillna(NAN_VALUE)
# removes rows with weight <= 0 and, when NAN_VALUE is None, rows with missing features
train = train.dropna()
gc.collect()
train.info()

In [None]:
if IS_ADD_FEATURES:
    # features.csv: first column is read as 'feature'; every other column is
    # used as a boolean mask selecting the features that carry that tag.
    tags = pd.read_csv('../input/jane-street-market-prediction/features.csv')
    tag_cols = list(tags.columns[1:])
    tag_f = {x: list(tags['feature'][tags[x]]) for x in tag_cols}
    # one new column per tag: row-wise sum of the features having that tag
    for col in tag_cols:
        train[col] = train[tag_f[col]].sum(axis=1).astype('float32')
    # extend in place, skipping names already present so re-running the cell
    # does not duplicate feature columns
    COL_FEATURES += [col for col in tag_cols if col not in COL_FEATURES]
    # NOTE(review): F_MEAN is computed before these columns exist and the
    # submission loop selects COL_FEATURES from the test frame - confirm the
    # tag columns are handled there before enabling IS_ADD_FEATURES.

In [None]:
%%time
# Split train, test
# The split is expressed as strings that are eval()'ed on demand, so no second
# copy of the data is kept in RAM: the last `pos_test` rows form the test part.
col_score = ['date', 'weight', 'resp']  # same columns as COL_SCORE
pos_test = int(len(train)*TEST_SIZE)
# with pos_test == 0, iloc[:-0] would be empty and iloc[-0:] the whole frame
assert pos_test > 0, 'TEST_SIZE is too small: the test split would be empty'
# seeded shuffle so the split is reproducible between runs
train = train.sample(frac=1, random_state=2020).copy() # shuffle train, access - eval(x_tr)...
gc.collect() # optimize RAM
x_tr = "train[COL_FEATURES].iloc[:-pos_test]"
y_tr = "train[COL_TARGET].iloc[:-pos_test]"
x_te = "train[COL_FEATURES].iloc[-pos_test:]"
y_te = "train[COL_TARGET].iloc[-pos_test:]"
print('Start training: train:', eval(x_tr).shape, eval(y_tr).shape, 
      'test:', eval(x_te).shape, eval(y_te).shape)
#print('Distribution of target:')
#print(eval(y_te).value_counts(sort=False, normalize=True))

In [None]:
try:
    from numba import njit
except ImportError:
    def njit(*args, **kwargs):
        """Stand-in for numba.njit when numba is missing: leave the function uncompiled."""
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]  # used as bare @njit
        return lambda func: func  # used as @njit(...)


# Score for Jane Street Market
@njit(fastmath=True)
def jsm_score(date, weight, resp, action):
    """Utility score of a set of trading decisions.

    Parameters
    ----------
    date : 1-D array of non-negative ints, day id of every row
    weight, resp, action : 1-D arrays of the same length as `date`

    Returns
    -------
    float
        u = clip(t, 0, 6) * sum(Pi), where Pi is the per-day sum of
        weight * resp * action and
        t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / number_of_distinct_days).
        Returns 0.0 when there are no rows or every Pi is zero (no trades),
        instead of dividing by zero.
    """
    Pi = np.bincount(date, weight * resp * action)
    # bincount has one slot per integer up to max(date), including days that
    # never occur, so count the distinct days rather than len(Pi)
    count_i = len(np.unique(date))
    sum_pi = np.sum(Pi)
    norm_pi = np.sqrt(np.sum(Pi ** 2))
    if count_i == 0 or norm_pi == 0:
        return 0.0
    t = sum_pi / norm_pi * np.sqrt(250 / count_i)
    u = min(max(t, 0.0), 6.0) * sum_pi
    return u

In [None]:
# Score obtained when the action equals the true target, for each split
score_cols = ('date', 'weight', 'resp', 'target')
idx_test = eval(y_te).index
max_score_test = jsm_score(*[train[c].loc[idx_test].values for c in score_cols])
idx_train = eval(y_tr).index
max_score_train = jsm_score(*[train[c].loc[idx_train].values for c in score_cols])
print('Max score in train: {:_.0f}'.format(max_score_train))
print('Max score in test: {:_.0f}'.format(max_score_test))

## Keras autoencoder

In [None]:
def create_models_ae(input_shape, latent_dim=16, lambda_l1=0, noise=0.05):
    """Build a dense autoencoder together with its encoder and decoder parts.

    Encoder: GaussianNoise(noise) -> 2 x Dense(128, swish)
             -> Dense(latent_dim, sigmoid) with an L1 activity regularizer.
    Decoder: 2 x Dense(128, swish) -> Dense(prod(input_shape), linear).

    Parameters
    ----------
    input_shape : shape of one input sample (without the batch axis)
    latent_dim  : width of the encoder output
    lambda_l1   : L1 factor of the activity regularizer on the encoder output
    noise       : argument passed to the GaussianNoise layer on the input

    Returns
    -------
    (encoder, decoder, autoencoder); only the autoencoder is compiled
    (optimizer 'adam', loss 'mse').
    """
    hidden_act = 'swish'
    n_outputs = np.prod(input_shape)  # flattened size of one sample

    # encoder
    enc_input = Input(input_shape)
    h = GaussianNoise(noise)(enc_input)
    for _ in range(2):
        h = Dense(128, activation=hidden_act)(h)
    bottleneck = Dense(
        latent_dim,
        activation='sigmoid',
        activity_regularizer=L1L2(lambda_l1, 0),
    )(h)
    encoder = Model(enc_input, bottleneck, name='encoder')

    # decoder
    dec_input = Input((latent_dim,))
    h = dec_input
    for _ in range(2):
        h = Dense(128, activation=hidden_act)(h)
    reconstruction = Dense(n_outputs, activation='linear')(h)
    decoder = Model(dec_input, reconstruction, name='decoder')

    # full model: encoder followed by decoder
    autoencoder = Model(enc_input, decoder(encoder(enc_input)), name='autoencoder')
    autoencoder.compile(optimizer='adam', loss='mse')
    return encoder, decoder, autoencoder

In [None]:
# Build the models for the current feature width with a 64-wide encoder output
ae_input_shape = eval(x_tr).shape[1:]
encoder, decoder, autoencoder = create_models_ae(
    ae_input_shape,
    latent_dim=64,
    lambda_l1=1e-4,
    noise=0.1,
)
autoencoder.summary()

In [None]:
#encoder.summary()

In [None]:
gc.collect()
if not IS_TRAIN_ENCODER:
    # reuse saved encoder weights: working directory locally, PATH otherwise
    weights_file = 'v05_encoder.hdf5' if LOCAL_MODE else PATH+'v05_encoder.hdf5'
    encoder.load_weights(weights_file)
else:
    # fit the autoencoder to reproduce its own input, then keep only the encoder weights
    ae_callbacks = [
        EarlyStopping('val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='loss', factor=0.7, patience=3, min_lr=0.000001, verbose=1),
    ]
    autoencoder.fit(
        eval(x_tr), eval(x_tr),
        epochs=1010,
        batch_size=16384,
        validation_data=(eval(x_te), eval(x_te)),
        callbacks=ae_callbacks,
    )
    encoder.save_weights('v05_encoder.hdf5')

Epoch 154/1010
73/73 [==============================] - 2s 30ms/step - loss: 0.1507 - val_loss: 0.1830  
Epoch 427/1010
87/87 [==============================] - 2s 23ms/step - loss: 0.1133 - val_loss: 0.1174  



In [None]:
gc.collect()
# Sanity check: mean encoder output for the first test row
# (value noted from an earlier run: 0.42087412)
first_test_row = eval(x_te).iloc[0:1].values
np.mean(encoder(first_test_row))

# Predict after encoder

In [None]:
# reshuffle train, test
# seeded so that the new split is reproducible between runs
train = train.sample(frac=1, random_state=2021).copy() # shuffle train, access - eval(x_tr)...
gc.collect() # optimize RAM
# The eval()-based split strings (x_tr, y_te, ...) now select different rows
# than when max_score_train / max_score_test were computed, so refresh both;
# otherwise the "Max score in test" printed later refers to the old split.
_score_cols = ('date', 'weight', 'resp', 'target')
_idx_test = eval(y_te).index
max_score_test = jsm_score(*[train[c].loc[_idx_test].values for c in _score_cols])
_idx_train = eval(y_tr).index
max_score_train = jsm_score(*[train[c].loc[_idx_train].values for c in _score_cols])
del _idx_test, _idx_train

In [None]:
gc.collect()
def get_best_th(y_test, y_proba_test):
    """Scan decision thresholds and return the one with the best utility score.

    Parameters
    ----------
    y_test : pandas object with the true targets; its index selects the rows
        of the global `train` frame that are scored.
    y_proba_test : 2-D array of class probabilities, one row per row of
        `y_test`; column 1 is taken as the probability of action 1.

    Returns
    -------
    float
        The last threshold among 0.5 and 0.400 ... 0.599 (step 0.001) whose
        score is >= the best score seen so far (0 if none could be scored).

    The action is `p1 > threshold`, the same rule the submission loop applies.
    The former `(proba > th).argmax(axis=1)` returned 0 whenever both columns
    exceeded `th`, so thresholds below 0.5 silently behaved like `1 - th`.
    """
    best_score = 0
    best_th = 0
    th_list = [0.5] + [0.4 + x/1000 for x in range(200)]
    # loop-invariant inputs of the score, taken from the rows named by y_test
    rows = y_test.index
    date = train['date'].loc[rows].values
    weight = train['weight'].loc[rows].values
    resp = train['resp'].loc[rows].values
    proba_up = y_proba_test[:, 1]
    for my_th in tqdm(th_list, desc='Find best threshold'):
        action = (proba_up > my_th).astype(int)
        try:
            score_test = jsm_score(date, weight, resp, action)
        except Exception:
            # best effort: a threshold that cannot be scored
            # (e.g. division by zero when no trade is taken) is skipped
            continue
        if (score_test >= best_score):
            best_score = score_test
            best_th = my_th
            acc_test = accuracy_score(y_test, action)*100
            print('Best score with TH: {:.3f}: acc_test: {:.2f}%, score_test: {:_.0f}'.format(
                my_th,
                acc_test, score_test))
    gc.collect()
    return best_th

In [None]:
def get_model_report(y_test, y_proba_test):
    """Print score, accuracy and a classification report at threshold 0.5,
    then plot the confusion matrix.

    y_test       : true targets; its index selects the scored rows of `train`
    y_proba_test : class probabilities as returned by predict_proba
    """
    my_th = 0.5
    # predicted class per row, computed once and reused below
    y_hat = (y_proba_test>my_th).astype(int).argmax(axis=1)
    rows = y_test.index
    score_test = jsm_score(
        train['date'].loc[rows].values,
        train['weight'].loc[rows].values,
        train['resp'].loc[rows].values,
        y_hat.reshape(-1)
    )
    acc_test = accuracy_score(y_test, y_hat)*100
    report_line = 'Report with TH: {:.3f}, acc_test: {:.2f}%, score_test: {:_.0f}'
    print(report_line.format(my_th, acc_test, score_test))
    print(classification_report(y_test, y_hat))
    conf_matrix = pd.DataFrame(
        data=confusion_matrix(y_test, y_hat),
        index=[0,1],
        columns=[0,1],
    )
    plt.figure(figsize = (6, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu", cbar=False)
    plt.title('Confusion matrix')
    plt.tight_layout()
    plt.show()

## CatBoostClassifier()

In [None]:
# Search the best options
gc.collect()
if IS_GRID:
    # fixed settings; only the parameters listed in `grid` are searched
    base_params = dict(
        iterations=100,
        learning_rate=0.05,
        custom_loss=['Accuracy'],
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=2020,
        task_type="GPU",
        l2_leaf_reg=5,
        border_count=20,
        leaf_estimation_iterations=7,
    )
    clf = CatBoostClassifier(**base_params)
    # other candidates tried earlier: learning_rate, l2_leaf_reg,
    # border_count, leaf_estimation_iterations
    grid = {'depth': list(range(4, 13))}
    grid_search_result = clf.grid_search(
        grid,
        X=encoder.predict(train[COL_FEATURES]),
        y=train[COL_TARGET],
        plot=True,
        verbose=False,
        cv=3,
        partition_random_seed=3,
        stratified=True,
    )

In [None]:
if IS_GRID:
    # header line followed by the parameter dict found by the grid search
    print('The best params', grid_search_result['params'], sep='\n')

In [None]:
gc.collect()
if not IS_TRAIN:
    # load the saved model: working directory locally, PATH otherwise
    clf_cb = CatBoostClassifier()
    model_file = 'v05_model_catboost.cbm' if LOCAL_MODE else PATH+'v05_model_catboost.cbm'
    clf_cb.load_model(model_file)
else:
    # per-row weight: resp * weight (its absolute value is passed to fit)
    tr_rows = eval(x_tr).index
    sample_weight = train['resp'].loc[tr_rows] * train['weight'].loc[tr_rows]
    clf_cb = CatBoostClassifier(
        iterations=10_000,
        loss_function='Logloss',
        task_type="GPU",
        depth=10,
        l2_leaf_reg=5,
        border_count=254,
        leaf_estimation_iterations=7,
        use_best_model=True,
        random_seed=2020,
        verbose=0,
    )
    # CatBoost is trained on the encoder outputs, not on the raw features
    clf_cb.fit(
        encoder.predict(eval(x_tr)), eval(y_tr),
        eval_set=(encoder.predict(eval(x_te)), eval(y_te)),
        sample_weight=np.abs(sample_weight),
        plot=True,
        verbose=0,
    )
    clf_cb.save_model('v05_model_catboost.cbm')
# **Select TH**

In [None]:
print('Max score in test: {:_.0f}'.format(max_score_test))
# probabilities of the test rows, predicted from their encoder outputs
y_true_te = eval(y_te)
y_proba_test = clf_cb.predict_proba(encoder.predict(eval(x_te)))
get_model_report(y_true_te, y_proba_test)
BEST_TH_CB = get_best_th(y_true_te, y_proba_test)
gc.collect()

Best score with TH: 0.507: acc_test: 51.79%, score_test: 563  
Best score with TH: 0.502: acc_test: 54.45%, score_test: 2_639  
Best score with TH: 0.500: acc_test: 53.57%, score_test: 3_148 (depth=8, iter=10_000)  
Best score with TH: 0.504: acc_test: 53.77%, score_test: 3_142 (depth=11, iter=2_000)  
Best score with TH: 0.529: acc_test: 53.96%, score_test: 3_419 (depth=10, iter=10_000)  




In [None]:
# Show the threshold returned by get_best_th for the CatBoost model
BEST_TH_CB

# **Submit prediction**

In [None]:
%%time
if IS_SUBMIT:
    import janestreet

    model_th = BEST_TH_CB
    if NAN_VALUE == 'mean':
        # training-set feature means as a (1, n_features) row for broadcasting
        f_mean = F_MEAN.values.reshape(1,-1)

    def _predict_action(x_features):
        """Return 0/1 actions: 1 where the class-1 probability exceeds model_th."""
        proba = clf_cb.predict_proba(encoder(x_features, training=False).numpy())
        return (proba[:, 1] > model_th).astype(int)

    env = janestreet.make_env() # initialize the environment
    iter_test = env.iter_test() # an iterator which loops over the test set
    count = 0
    # NOTE(review): only the first row's weight is checked, so each batch is
    # assumed to hold a single row - confirm against the janestreet API.
    for (val_df, sample_prediction_df) in tqdm(iter_test):
        action = 0 # default: no trade
        if val_df['weight'].iloc[0] > 0:
            x_val = val_df[COL_FEATURES].values
            # `is None` (not truthiness) so that a fill value of 0 is honoured
            if NAN_VALUE is None:
                # no imputation configured: skip rows with missing features
                if not np.isnan(x_val.sum()):
                    action = _predict_action(x_val)
            else:
                if NAN_VALUE == 'mean':
                    # new array instead of an in-place write: `.values` may be read-only
                    x_val = np.nan_to_num(x_val) + np.isnan(x_val) * f_mean
                else:
                    x_val = np.nan_to_num(x_val, nan=NAN_VALUE, posinf=NAN_VALUE, neginf=NAN_VALUE)
                action = _predict_action(x_val)
        sample_prediction_df.action = action
        env.predict(sample_prediction_df)
        count += 1
    print('Submit done ({})'.format(count))

In [None]:
#env.predict(sample_prediction_df)