## Imports

In [None]:
# Fetch the project repository into ./qber-forecasting.
# NOTE(review): presumably only needed when running outside a local checkout
# (e.g. on Colab) — the path cell below currently points at '../..' instead.
!git clone https://github.com/rmnigm/qber-forecasting.git

In [None]:
!pip install catboost

In [1]:
import collections
import math
import os
import pathlib
import random


import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
import scipy.stats as sps

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from tqdm.notebook import tqdm

In [2]:
# Single seed shared by NumPy's legacy global RNG and the stdlib `random` module.
seed = 123456
np.random.seed(seed)
random.seed(seed)

## Datasets

In [3]:
def calculate_offset_limit(offset, limit, length) -> tuple[int, int]:
    """Resolve optional ``offset`` / ``limit`` arguments into integers.

    Each argument is interpreted independently:

    * ``None`` falls back to a default (``0`` for ``offset``, ``length`` for
      ``limit``);
    * a value ``>= 1`` is taken as an absolute number and truncated to ``int``;
    * a value ``< 1`` is taken as a fraction of ``length`` and truncated to
      ``int`` after scaling.

    Args:
        offset: ``None``, an absolute value, or a fraction of ``length``.
        limit: ``None``, an absolute value, or a fraction of ``length``.
        length: Total size that fractions are scaled against.

    Returns:
        ``(offset, limit)`` as plain ``int`` values.
    """
    def _resolve(value, default) -> int:
        # One rule for both arguments; only the fallback differs.
        if value is None:
            return int(default)
        # int() guarantees the advertised return type even when an absolute
        # value is passed as a float (e.g. 5.0).
        return int(value) if value >= 1 else int(value * length)

    return _resolve(offset, 0), _resolve(limit, length)

In [4]:
def build(
    data_path: str | pathlib.Path,
    window_size: int,
    dtype: np.dtype = np.float32,
    columns: list[str] | None = None,
    offset: int | float | None = None,
    limit: int | float | None = None) -> np.ndarray:
    """Load a CSV slice and expose it as overlapping sliding windows.

    The file is scanned with polars, restricted to ``columns`` and to the row
    range described by ``offset`` / ``limit``, converted to a NumPy array and
    wrapped in ``sliding_window_view`` along the row axis.

    Args:
        data_path: Path to the CSV file.
        window_size: Number of history steps per window; every window holds
            ``window_size + 1`` consecutive rows.
        dtype: Accepted for API compatibility but NOT applied — the result
            keeps whatever dtype ``DataFrame.to_numpy()`` produces.
            NOTE(review): casting here would change downstream numeric
            results, so confirm before wiring it in.
        columns: Columns to keep; ``None`` (or an empty list) keeps all
            columns of the file.
        offset: Start row, either absolute (``>= 1``) or as a fraction of the
            total row count (``< 1``); ``None`` means 0.
        limit: Value passed as the second argument of ``slice`` (a row count
            in polars), absolute or fractional like ``offset``; ``None``
            means the total row count.

    Returns:
        A single array of shape
        ``(n_rows - window_size, n_columns, window_size + 1)``. It is the view
        returned by ``sliding_window_view``, not a copy.
    """
    dataframe = pl.scan_csv(data_path)
    # Total row count is needed to turn fractional offset/limit into rows.
    length = dataframe.select(pl.count()).collect().item()
    offset, limit = calculate_offset_limit(offset, limit, length)
    columns = columns or dataframe.columns
    data_array = (
        dataframe
        .select(columns)
        .slice(offset, limit)
        .collect()
        .to_numpy()
    )
    # Windowing over axis 0 (rows) appends the window axis last.
    return np.lib.stride_tricks.sliding_window_view(
        data_array,
        window_size + 1,
        axis=0,
    )

In [6]:
# Root of the qber-forecasting repository, relative to this notebook.
# Alternative when the repository was cloned next to the notebook:
# qber_path = pathlib.Path('qber-forecasting')
qber_path = pathlib.Path('../..')

# CSV columns to load, in this order.
columns = [
    'e_mu_current',
    'e_mu_estimated',
    'e_nu_1',
    'e_nu_2',
    'q_mu',
    'q_nu1',
    'q_nu2',
]

In [7]:
# Chronological split of the same CSV with 30-step windows: the first 75% of
# rows form the training set, the rows from the 75% mark onward the test set.
train_data = build(
    qber_path / 'datasets' / 'data.csv',
    30,
    columns=columns,
    limit=0.75,
)
test_data = build(
    qber_path / 'datasets' / 'data.csv',
    30,
    columns=columns,
    offset=0.75,
)

# Index 0 on axis 1 keeps only the first selected column. Along the window
# axis, every value but the last is a feature and the last one is the target.
train_x = train_data[:, 0, :-1]
train_y = train_data[:, 0, -1]
test_x = test_data[:, 0, :-1]
test_y = test_data[:, 0, -1]

In [8]:
# Whole file (no offset/limit) with the same 30-step window; features and
# target are taken from the first selected column.
data = build(
    qber_path / 'datasets' / 'data.csv',
    30,
    columns=columns,
)
X = data[:, 0, :-1]
y = data[:, 0, -1]

In [11]:
# Baseline: plain linear regression scored with 5 ordered time-series splits.
tscv = TimeSeriesSplit(n_splits=5)
model = LinearRegression()
cross_validate(
    model,
    X,
    y,
    cv=tscv,
    scoring=['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error'],
)

{'fit_time': array([0.03069901, 0.26828003, 0.25461388, 0.25047183, 0.25718117]),
 'score_time': array([0.04035902, 0.05623603, 0.05789328, 0.06686497, 0.05307984]),
 'test_r2': array([0.78152577, 0.68028839, 0.46324367, 0.84042702, 0.63877677]),
 'test_neg_mean_squared_error': array([-1.37742247e-05, -2.36381610e-06, -1.40567231e-06, -1.16394270e-05,
        -1.11401251e-05]),
 'test_neg_root_mean_squared_error': array([-0.00371136, -0.00153747, -0.00118561, -0.00341166, -0.00333768])}

In [145]:
class CompositeModel(RegressorMixin, BaseEstimator):
    """Linear regression plus a gradient-boosting model fitted on its residuals.

    The final prediction is the sum of the linear prediction and the boosting
    model's prediction of the linear model's error.

    Args:
        boost_type: Which boosting library models the residuals:
            ``'lgb'`` for ``LGBMRegressor`` or ``'cb'`` for
            ``CatBoostRegressor``. It is the only constructor parameter.
    """

    def __init__(self, boost_type: str = 'lgb'):
        super().__init__()
        self.base = LinearRegression()
        self.boost_type = boost_type
        assert self.boost_type in ('lgb', 'cb'), "boost_type must be 'lgb' or 'cb'"
        # Both boosters are created with their training logs silenced.
        if self.boost_type == 'lgb':
            self.boost = LGBMRegressor(verbose=-1)
        elif self.boost_type == 'cb':
            self.boost = CatBoostRegressor(verbose=False)

    def fit(self, X, y):
        """Fit the linear model on ``(X, y)``, then the booster on its residuals.

        Returns:
            ``self``, as the scikit-learn estimator API requires, so calls can
            be chained (``model.fit(X, y).predict(X)``).
        """
        self.base.fit(X, y)
        # Residuals are computed on the same data the linear model was fitted on.
        residuals = y - self.base.predict(X)
        self.boost.fit(X, residuals)
        return self

    def predict(self, X):
        """Return the linear prediction corrected by the predicted residual."""
        return self.base.predict(X) + self.boost.predict(X)

In [146]:
# Same 5-split time-series cross-validation, now for the LightGBM-backed composite.
tscv = TimeSeriesSplit(n_splits=5)
model = CompositeModel('lgb')
cross_validate(
    model,
    X,
    y,
    cv=tscv,
    scoring=['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error'],
)

{'fit_time': array([1.08185792, 2.12320447, 5.41279435, 3.1386404 , 4.22732806]),
 'score_time': array([0.29574299, 1.15385747, 0.30767798, 0.29849815, 0.34766555]),
 'test_r2': array([0.79445223, 0.72936555, 0.47727244, 0.85753899, 0.70282349]),
 'test_neg_mean_squared_error': array([-1.29592454e-05, -2.00095976e-06, -1.36893337e-06, -1.03912612e-05,
        -9.16492422e-06]),
 'test_neg_root_mean_squared_error': array([-0.0035999 , -0.00141455, -0.00117001, -0.00322355, -0.00302736])}

In [147]:
# Fit each composite variant on the training split and report R2 / MSE / RMSE
# on both the training and the test split.
for model_type in ('lgb', 'cb'):
    model = CompositeModel(model_type)
    model.fit(train_x, train_y)
    train_preds = model.predict(train_x)
    test_preds = model.predict(test_x)
    # MSE is computed once per split and RMSE is derived from it: the
    # `squared=False` argument of mean_squared_error is deprecated and removed
    # in recent scikit-learn releases.
    train_mse = mean_squared_error(train_y, train_preds)
    test_mse = mean_squared_error(test_y, test_preds)
    print(model.boost)
    print(f'train R2 = {r2_score(train_y, train_preds):.7f}')
    print(f'test R2 = {r2_score(test_y, test_preds):.7f}')
    print(f'train MSE = {train_mse:.7f}')
    print(f'test MSE = {test_mse:.7f}')
    print(f'train RMSE = {train_mse ** 0.5:.7f}')
    print(f'test RMSE = {test_mse ** 0.5:.7f}')
    print('-' * 60)

LGBMRegressor(verbose=-1)
train R2 = 0.8855169
test R2 = 0.8148065
train MSE = 0.0000021
test MSE = 0.0000127
train RMSE = 0.0014446
test RMSE = 0.0035682
------------------------------------------------------------
<catboost.core.CatBoostRegressor object at 0x7f2c340bfa90>
train R2 = 0.9330754
test R2 = 0.8215365
train MSE = 0.0000012
test MSE = 0.0000123
train RMSE = 0.0011045
test RMSE = 0.0035028
------------------------------------------------------------
