# Jane Street Real-Time Market Data Forecasting
## Using LightGBM with Optuna and k-fold

### Importing libraries

In [None]:
import optuna
import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
import os
import joblib
import kaggle_evaluation.jane_street_inference_server

In [None]:
# Column used as the regression label.
TARGET = "responder_6"
# Feature column names, zero-padded to two digits: feature_00 ... feature_78.
FEAT_COLS = [f"feature_{number:02d}" for number in range(79)]

### Applying filters to the data

In [None]:
def load_data(date_id_range=None, time_id_range=None, columns=None, return_type='pl'):
    """Read a (possibly filtered) slice of ``train.parquet``.

    The file is scanned lazily, so the filters and the column selection are
    part of the query that is executed by the final ``collect()``.

    Args:
        date_id_range: Optional ``(start, end)`` pair; keeps rows whose
            ``date_id`` lies in the inclusive range.
        time_id_range: Optional ``(start, end)`` pair; keeps rows whose
            ``time_id`` lies in the inclusive range.
        columns: Optional column selection passed to ``select``.
        return_type: ``'pd'`` returns a pandas DataFrame; any other value
            returns the collected polars DataFrame.

    Returns:
        The selected rows/columns as a pandas or polars DataFrame.
    """
    data_dir = '../input/jane-street-real-time-market-data-forecasting' # Depending on the coding environment
    lazy_frame = pl.scan_parquet(f"{data_dir}/train.parquet")

    # Both range filters have the same shape, so apply them in one loop
    # (date filter first, then time filter).
    for column_name, bounds in (("date_id", date_id_range), ("time_id", time_id_range)):
        if bounds is None:
            continue
        lower, upper = bounds
        in_range = (pl.col(column_name) >= lower) & (pl.col(column_name) <= upper)
        lazy_frame = lazy_frame.filter(in_range)

    if columns is not None:
        lazy_frame = lazy_frame.select(columns)

    frame = lazy_frame.collect()
    return frame.to_pandas() if return_type == 'pd' else frame

In [None]:
def calculate_r2(y_true, y_pred, weights):
    """Return the sample-weighted, zero-mean R² score.

    Computes ``1 - sum(w * (y - y_hat)**2) / sum(w * y**2)``.

    All inputs are converted to flat float64 NumPy arrays first. Without
    this, mixing pandas Series with arrays makes the arithmetic align on
    index labels instead of positions, which silently yields NaN terms when
    the indices differ, and plain Python lists do not support the arithmetic
    at all.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predictions, same length as ``y_true``.
        weights: Array-like of per-sample weights, same length as ``y_true``.

    Returns:
        The weighted R² as a NumPy float. If ``sum(w * y**2)`` is zero the
        division is undefined and the result is NaN or -inf.
    """
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    weights = np.asarray(weights, dtype=np.float64).ravel()

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * (y_true ** 2))
    return 1 - (numerator / denominator)

In [None]:
class ModelGroup:
    """A simple ensemble that averages the predictions of its member models.

    Each member only needs a ``predict(features)`` method returning an
    array-like of predictions.
    """

    def __init__(self):
        # Member models, in insertion order.
        self.models = []

    def add_model(self, model):
        """Append ``model`` to the ensemble."""
        self.models.append(model)

    def predict(self, test_data):
        """Return the element-wise mean of all members' predictions.

        Args:
            test_data: Table-like object indexable by a list of column names;
                the ``FEAT_COLS`` columns are passed to every model.

        Returns:
            The predictions averaged over models (``np.mean`` along axis 0).

        Raises:
            ValueError: If the group contains no models. Averaging an empty
                list would otherwise silently produce NaN.
        """
        if not self.models:
            raise ValueError("ModelGroup has no models; add at least one model before predicting")

        # The column selection is identical for every model, so do it once.
        features = test_data[FEAT_COLS]
        preds = [model.predict(features) for model in self.models]
        return np.mean(preds, axis=0)

    @classmethod
    def load(cls, file_path):
        """Load a model group from a file."""
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; only load files from a trusted source.
        return joblib.load(file_path)

### Hyperparameter tuning with Optuna and training using k-fold

In [None]:
def _fold_ranges(total_days, n_splits):
    """Split date_ids 0..total_days-1 into contiguous inclusive (start, end) ranges."""
    fold_size = total_days // n_splits
    # The last fold absorbs the remainder so no trailing days are dropped.
    return [
        (i * fold_size, total_days - 1 if i == n_splits - 1 else (i + 1) * fold_size - 1)
        for i in range(n_splits)
    ]


def _load_fold_split(folds, valid_idx):
    """Load the data with ``folds[valid_idx]`` held out for validation.

    Returns ``(train_ds, valid_ds, valid_x, valid_y, valid_w)``: the two
    LightGBM datasets plus the raw validation features/labels/weights needed
    for scoring.
    """
    columns = ["weight"] + FEAT_COLS + [TARGET]
    valid_data = load_data(date_id_range=folds[valid_idx], columns=columns, return_type='pl')
    train_data = pl.concat(
        [
            load_data(date_id_range=fold, columns=columns, return_type='pl')
            for i, fold in enumerate(folds)
            if i != valid_idx
        ]
    )

    # Train on FEAT_COLS only: ModelGroup.predict feeds exactly these columns
    # to every model, so adding 'weight' as a feature here would make
    # inference fail with a feature-count mismatch.
    valid_x = valid_data.select(FEAT_COLS).to_pandas()
    valid_y = valid_data[TARGET].to_pandas()
    valid_w = valid_data['weight'].to_pandas()

    train_ds = lgb.Dataset(
        train_data.select(FEAT_COLS).to_pandas(),
        label=train_data[TARGET].to_pandas(),
        weight=train_data['weight'].to_pandas(),
    )
    valid_ds = lgb.Dataset(valid_x, label=valid_y, weight=valid_w, reference=train_ds)
    return train_ds, valid_ds, valid_x, valid_y, valid_w


def _lgb_params(tuned):
    """Merge the tuned hyperparameters into the fixed LightGBM settings."""
    params = {
        'objective': 'regression_l2',
        'metric': 'rmse',
        # Row subsampling is only applied when the bagging frequency is
        # positive; without this the tuned `subsample` value has no effect.
        'subsample_freq': 1,
        'device': 'gpu',
    }
    params.update(tuned)
    return params


def _fit_booster(params, train_ds, valid_ds):
    """Train one booster with early stopping on the validation set."""
    return lgb.train(
        params,
        train_ds,
        num_boost_round=500,
        valid_sets=[train_ds, valid_ds],
        valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
    )


def _tune_hyperparameters(folds, n_trials):
    """Run the Optuna search with fold 0 held out and return the best tuned values."""
    # Load once and reuse across trials; only the booster parameters change
    # between trials, so re-reading the parquet file every trial is wasted work.
    train_ds, valid_ds, valid_x, valid_y, valid_w = _load_fold_split(folds, 0)

    def objective(trial):
        tuned = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'max_depth': trial.suggest_int('max_depth', -1, 10),
            'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10, log=True),
        }
        model = _fit_booster(_lgb_params(tuned), train_ds, valid_ds)
        return calculate_r2(valid_y, model.predict(valid_x), valid_w)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    print("Best parameters:", study.best_params)
    return study.best_params


def train_lgb_kfold_optuna(total_days=1499, n_splits=5, n_trials=50):
    """Tune LightGBM with Optuna, then train one model per fold.

    The date_ids ``0 .. total_days - 1`` are split into ``n_splits``
    contiguous folds. Hyperparameters are searched with fold 0 held out
    (maximising the weighted R² on that fold). The best parameters are then
    used to train one booster per fold, each holding out a different fold
    for early stopping, and all boosters are collected in a ``ModelGroup``.

    Args:
        total_days: Number of date_ids to use, starting at 0.
        n_splits: Number of folds; must be at least 2.
        n_trials: Number of Optuna trials to run.

    Returns:
        A ``ModelGroup`` containing ``n_splits`` trained boosters.

    Raises:
        ValueError: If ``n_splits < 2`` or ``total_days < n_splits``.
    """
    if n_splits < 2:
        raise ValueError("n_splits must be at least 2 so that every fold has training data")
    if total_days < n_splits:
        raise ValueError("total_days must be at least n_splits")

    folds = _fold_ranges(total_days, n_splits)
    best_params = _lgb_params(_tune_hyperparameters(folds, n_trials))

    model_group = ModelGroup()
    for valid_idx in range(n_splits):
        train_ds, valid_ds, valid_x, valid_y, valid_w = _load_fold_split(folds, valid_idx)
        model = _fit_booster(best_params, train_ds, valid_ds)
        fold_r2 = calculate_r2(valid_y, model.predict(valid_x), valid_w)
        print(f"Fold {valid_idx} validation R2: {fold_r2:.6f}")
        model_group.add_model(model)
        # Release this fold's data before the next fold is loaded.
        del train_ds, valid_ds, valid_x, valid_y, valid_w

    return model_group

### Model training

In [None]:
# Run the Optuna search / training routine over 1699 date_ids in 5 folds.
# The result is bound to the module-level name `model`, which predict()
# below reads when producing predictions.
model = train_lgb_kfold_optuna(total_days=1699, n_splits=5)

In [None]:
# Most recent non-None `lags` frame handed to predict(); stored but not read here.
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Produce ``responder_6`` predictions for one batch of test rows.

    Args:
        test: Batch of rows; must contain ``row_id``, the ``FEAT_COLS``
            columns and ``weight``.
        lags: Optional lag frame; when not None it replaces the module-level
            ``lags_``.

    Returns:
        A frame with exactly the columns ``row_id`` and ``responder_6`` and
        one row per row of ``test``.
    """
    global lags_
    if lags is not None:
        lags_ = lags

    # Start from a frame with a placeholder responder column...
    placeholder = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6'),
    )

    model_input = test[FEAT_COLS+['weight']].to_pandas()
    raw_pred = model.predict(model_input)

    # ...then overwrite the placeholder with the flattened model output.
    responder = pl.Series('responder_6', raw_pred.ravel())
    predictions = placeholder.with_columns(responder)
    print(predictions)

    # Sanity checks on the frame that is handed back to the caller.
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions

In [None]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set, call serve(); otherwise run the
# local gateway against the test and lags parquet paths.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    inference_server.serve()