# 1. Imports

In [63]:
# For Kaggle

# Standard Libraries
import sys
import os
import collections

# Data Science Libraries
import pandas as pd
import numpy as np
import optuna

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Statistics Libraries
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.graphics.api as smg

# ML Libraries
import sklearn as sk
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost
import catboost

# Project Libraries
import optiver2023

# Configure Visualization
%matplotlib inline
plt.style.use('bmh')

# Configure Pandas and SKLearn
pd.set_option("display.max_colwidth", 20)
pd.set_option("display.precision", 3)
sk.set_config(display="diagram")

# File Specific Configurations
DATA_DIR = "/kaggle/input/optiver-trading-at-the-close/"
plt.rcParams['figure.dpi'] = 270
START = pd.Timestamp.now()
SEED = 42

# 2. LightGBM Baseline

In [25]:
# Read the training table (features and target together) from the data directory.
train_csv_path = os.path.join(DATA_DIR, "train.csv")
Xy_train = pd.read_csv(train_csv_path)
Xy_train

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3.181e+06,1,1.000,1.338e+07,,,1.000,60651.50,1.000,8493.03,1.000,-3.030,0,0_0_0
1,1,0,0,1.666e+05,-1,1.000,1.642e+06,,,1.000,3233.04,1.001,20605.09,1.000,-5.520,0,0_0_1
2,2,0,0,3.029e+05,-1,1.000,1.819e+06,,,0.999,37956.00,1.000,18995.00,1.000,-8.390,0,0_0_2
3,3,0,0,1.192e+07,-1,1.000,1.839e+07,,,1.000,2324.90,1.000,479032.40,1.000,-4.010,0,0_0_3
4,4,0,0,4.475e+05,-1,1.000,1.786e+07,,,0.999,16485.54,1.000,434.10,1.000,-7.350,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.441e+06,-1,1.000,2.828e+07,1.000,1.000,1.000,32257.04,1.000,319862.40,1.000,2.310,26454,480_540_195
5237976,196,480,540,3.495e+05,-1,1.001,9.188e+06,1.000,1.000,1.001,205108.40,1.001,93393.07,1.001,-8.220,26454,480_540_196
5237977,197,480,540,0.000e+00,0,0.996,1.273e+07,0.996,0.996,0.996,16790.66,0.996,180038.32,0.996,1.169,26454,480_540_197
5237978,198,480,540,1.001e+06,1,0.999,9.477e+07,0.999,0.999,0.999,125631.72,0.999,669893.00,0.999,-1.540,26454,480_540_198


In [45]:
# Separate the features from the target; the results are reused in the
# following sections. Rows without a target are discarded and the identifier
# columns `row_id` / `time_id` are removed from the feature frame.
X = Xy_train.query("target.notna()").drop(['row_id', "time_id"], axis=1)
y = X.pop("target")

# Number of rows in the hold-out tail (last 20% of the data). It is assigned
# here because the cells that follow slice `y` with it; without this line a
# fresh "Restart & Run All" stops with a NameError on `y_len`.
y_len = int(len(y) * 0.2)

In [46]:
# Preview the last `y_len` target values (positional slice).
y.iloc[-y_len:]

4190346    -0.090
4190347    11.621
4190348     0.581
4190349    -0.070
4190350     1.190
            ...  
5237975     2.310
5237976    -8.220
5237977     1.169
5237978    -1.540
5237979    -6.530
Name: target, Length: 1047578, dtype: float64

In [47]:
# Preview everything except the last `y_len` target values, plus the total count.
(y.iloc[:-y_len], len(y))

(0          -3.03
 1          -5.52
 2          -8.39
 3          -4.01
 4          -7.35
            ...  
 4190341   -31.97
 4190342     4.43
 4190343     0.55
 4190344     5.07
 4190345     6.98
 Name: target, Length: 4190314, dtype: float64,
 5237892)

In [48]:
# Hold out the final 20% of the rows for validation; everything before that
# is used for training. Slices are positional, so row order is preserved.
X_len = int(0.2 * len(X))
y_len = int(0.2 * len(y))

X_train = X.iloc[:-X_len]
X_valid = X.iloc[-X_len:]
y_train = y.iloc[:-y_len]
y_valid = y.iloc[-y_len:]

In [49]:
# Wrap both splits in LightGBM Dataset objects so they can be passed to lgb.train.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_valid, label=y_valid)

## 2.2. Simplest LightGBM Solution

In [50]:
#env = optiver2023.make_env()
#iter_test = env.iter_test()

In [51]:
#model = lgb.LGBMRegressor(
#    random_state = SEED, 
#    objective = 'mae', 
#    device_type = 'gpu'
#)

#model.fit(X_train, y_train)

#counter = 0
#for (test, revealed_targets, sample_prediction) in iter_test:
#    sample_prediction['target'] = model.predict(test.drop('row_id', axis = 1))
#    env.predict(sample_prediction)
#    counter += 1

In [57]:
# LightGBM training configuration passed to lgb.train below.
params = dict(
    random_state=SEED,       # reproducible runs
    device_type='GPU',
    max_depth=7,
    num_leaves=31,
    n_estimators=400,        # upper bound on boosting rounds (early stopping may end sooner)
    objective='regression',
    learning_rate=0.01,
    verbose=0,
)

In [61]:
# Train the booster while evaluating on both the training and validation sets.
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of `lgb.train` was removed in LightGBM 4.0 (TypeError there), whereas
# the callback form is accepted by both 3.x and 4.x releases.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    callbacks=[
        # Stop when the validation score has not improved for 50 rounds.
        lgb.early_stopping(stopping_rounds=50),
        # Report the evaluation metrics every 50 rounds.
        lgb.log_evaluation(period=50),
    ],
)

[1]	training's l2: 91.4073	valid_1's l2: 81.0129
Training until validation scores don't improve for 50 rounds
[2]	training's l2: 91.3781	valid_1's l2: 80.9925
[3]	training's l2: 91.3494	valid_1's l2: 80.9724
[4]	training's l2: 91.3216	valid_1's l2: 80.9532
[5]	training's l2: 91.2938	valid_1's l2: 80.934
[6]	training's l2: 91.267	valid_1's l2: 80.9154
[7]	training's l2: 91.2407	valid_1's l2: 80.8973
[8]	training's l2: 91.2147	valid_1's l2: 80.8791
[9]	training's l2: 91.1889	valid_1's l2: 80.8624
[10]	training's l2: 91.1638	valid_1's l2: 80.845
[11]	training's l2: 91.1396	valid_1's l2: 80.828
[12]	training's l2: 91.1152	valid_1's l2: 80.8118
[13]	training's l2: 91.0912	valid_1's l2: 80.7969
[14]	training's l2: 91.0681	valid_1's l2: 80.7812
[15]	training's l2: 91.0449	valid_1's l2: 80.7669
[16]	training's l2: 91.0226	valid_1's l2: 80.7514
[17]	training's l2: 91.0003	valid_1's l2: 80.7362
[18]	training's l2: 90.979	valid_1's l2: 80.7217
[19]	training's l2: 90.9574	valid_1's l2: 80.7083
[20

In [64]:
# Score the hold-out split with mean absolute error.
# NaNs (if any) in predictions or targets are replaced with 0 before scoring.
y_pred_valid = np.nan_to_num(model.predict(X_valid))
y_valid = np.nan_to_num(y_valid)

mae = mean_absolute_error(y_true=y_valid, y_pred=y_pred_valid)
print(f"score MAE {mae}")

score MAE 5.99578224263733


In [22]:
# For the submission, instantiate the competition environment and the test-data iterator
env = optiver2023.make_env()
iter_test = env.iter_test()

In [23]:
# Walk through every batch yielded by the test iterator: predict on the batch
# features (everything except `row_id`), write the predictions into the
# provided frame and submit it back to the environment.
counter = 0  # number of batches processed
for test, revealed_targets, sample_prediction in iter_test:
    sample_prediction['target'] = model.predict(test.drop(columns='row_id'))
    env.predict(sample_prediction)
    counter = counter + 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


## 2.3. Improved LightGBM Solution

In [None]:
# def calculate_imbalance_features(df):
#     # Calculate and add imbalance feature 1 (imb_s1)
#     df['imb_s1'] = df.eval('(bid_size - ask_size) / (bid_size + ask_size)')  

#     # Calculate and add imbalance feature 2 (imb_s2)
#     df['imb_s2'] = df.eval('(imbalance_size - matched_size) / (matched_size + imbalance_size)') 

#     return df

In [None]:
# estimator = lgb.LGBMRegressor(
#     boosting_type='gbdt', 
#     num_leaves=31, 
#     max_depth=-1, 
#     learning_rate=0.1, 
#     n_estimators=100, 
#     subsample_for_bin=200000, 
#     objective='mae', 
#     class_weight=None, 
#     min_split_gain=0.0, 
#     min_child_weight=0.001, 
#     min_child_samples=20, 
#     subsample=1.0, 
#     subsample_freq=0, 
#     colsample_bytree=1.0, 
#     reg_alpha=0.0, 
#     reg_lambda=0.0, 
#     random_state=SEED, 
#     n_jobs=-1, 
#     importance_type='split',
#     force_row_wise=True
# )

# val_predictions = np.zeros(len(X_train))
# val_scores = []

# splitter = sk.model_selection.TimeSeriesSplit(5).split(X_train, y_train)

# for fold, (train_idx, val_idx) in enumerate(splitter):
#     model = sk.base.clone(estimator)

#     # Define train and val set (fold-local names, so the full training data
#     # is not overwritten and val_idx still indexes the original frame)
#     X_fold_train = X_train.iloc[train_idx]
#     y_fold_train = y_train.iloc[train_idx]
#     X_val = X_train.iloc[val_idx]
#     y_val = y_train.iloc[val_idx]

#     model.fit(X_fold_train, y_fold_train)
#     val_scores.append(sk.metrics.mean_absolute_error(y_val, model.predict(X_val)))

# print(f'Val Score: {np.mean(val_scores):.2f} ± {np.std(val_scores):.5f}')

In [None]:
# import optiver2023

# env = optiver2023.make_env()
# iter_test = env.iter_test()

# model.fit(X_train, y_train)

# counter = 0
# for (test, revealed_targets, sample_prediction) in iter_test:
#     sample_prediction['target'] = model.predict(test.drop('row_id', axis = 1))
#     env.predict(sample_prediction)
#     counter += 1

# 3. Final Thoughts

In [None]:
# Report the wall-clock time elapsed since START was recorded in the configuration cell.
END = pd.Timestamp.now()
elapsed = END - START
time_elapsed = elapsed.total_seconds()
print(f"Notebook Total Time: {time_elapsed:.2f}s")