In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 

In [2]:
from numpy.lib.stride_tricks import sliding_window_view
def compute_return_numeric(price: pd.Series, ts: pd.Series) -> pd.Series:
    """
    ΔP / Δt où ts est déjà un float64 (ex. secondes depuis epoch).
    """
    return price.diff() / ts.diff()

def create_dataset_vectorized_numeric(
    features_BTC: pd.DataFrame,
    price_ETH:    pd.DataFrame,
    tau_BTC:      int,
    tau_ETH:      int,
    tau_pred:     int,
):
    """
    X : (N, tau_BTC * k + tau_ETH)
    y : (N,)
    Tous les timestamps sont supposés float64 (ou seront castés).
    """

    # ------------------------------------------------------------------ #
    # 0.  Cast en float64 & tri                                           #
    # ------------------------------------------------------------------ #

    features_BTC.sort_values("timestamp", inplace=True, ignore_index=True)
    price_ETH.sort_values("timestamp",    inplace=True, ignore_index=True)

    # ------------------------------------------------------------------ #
    # 1.  Retour ETH + cible future                                       #
    # ------------------------------------------------------------------ #
    ret_eth = compute_return_numeric(price_ETH["price"], price_ETH["timestamp"])
    ret_eth.name = "return"

    df_ret = pd.concat([price_ETH["timestamp"], ret_eth], axis=1)

    df_ret["target"] = (
        df_ret["return"]
        .rolling(window=tau_pred, min_periods=tau_pred)
        .mean()
        .shift(-tau_pred + 1)
    )
    df_ret = df_ret.dropna(subset=["target"]).reset_index(drop=True)

    ts_y = df_ret["timestamp"].to_numpy()
    y    = df_ret["target"].to_numpy()

    # ------------------------------------------------------------------ #
    # 2.  Fenêtres BTC                                                   #
    # ------------------------------------------------------------------ #
    btc_mat = features_BTC.drop(columns="timestamp").to_numpy()          # (M_btc, k)
    ts_btc  = features_BTC["timestamp"].to_numpy()                       # (M_btc,)

    win_btc = sliding_window_view(
        btc_mat, window_shape=(tau_BTC, btc_mat.shape[1])
    )[:, 0, :, :]                                                        # (⋯, tau_BTC, k)
    win_btc_flat = win_btc.reshape(win_btc.shape[0], -1)                 # (⋯, tau_BTC*k)
    ts_btc_win   = ts_btc[tau_BTC - 1 :]                                 # timestamp fin fenêtre

    idx_btc   = np.searchsorted(ts_btc_win, ts_y, side="right") - 1
    valid_btc = idx_btc >= 0

    # ------------------------------------------------------------------ #
    # 3.  Fenêtres ETH (historique)                                      #
    # ------------------------------------------------------------------ #
    ret_full = ret_eth.fillna(0.0).to_numpy()
    win_eth  = sliding_window_view(ret_full, window_shape=tau_ETH)       # (M_eth, tau_ETH)

    ts_eth   = price_ETH["timestamp"].to_numpy()
    pos_y    = np.searchsorted(ts_eth, ts_y)                             # index de t_y
    idx_eth  = pos_y - tau_ETH
    valid_eth = idx_eth >= 0

    # ------------------------------------------------------------------ #
    # 4.  Masque et assemblage                                           #
    # ------------------------------------------------------------------ #
    mask = valid_btc & valid_eth
    if not np.any(mask):
        raise ValueError("Aucun point ne possède suffisamment d'historique.")

    X_btc = win_btc_flat[idx_btc[mask]]
    X_eth = win_eth[idx_eth[mask]]

    X = np.hstack([X_btc, X_eth])
    y = y[mask]

    return X, y

In [3]:
features_BTC = pd.read_parquet('../data/features/DATA_0/XBT_EUR.parquet')
features_ETH = pd.read_parquet('../data/features/DATA_0/ETH_EUR.parquet')

In [4]:
features_BTC.head(n=100)

Unnamed: 0_level_0,level-1-bid-price,level-1-bid-volume,level-2-bid-price,level-2-bid-volume,level-3-bid-price,level-3-bid-volume,level-4-bid-price,level-4-bid-volume,level-5-bid-price,level-5-bid-volume,...,vwap-bid-5-levels,vwap-ask-5-levels,avg-vwap-diff-5-levels,liquidity-ratio,rate-inst-volatility,rate-momentum,rate-mid-price-trend,rate-vwap-diff-5-levels,rate-bid-volume-level-1,rate-ask-volume-level-1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.748488e+09,96207.1,0.831635,96205.7,2.686969,96204.4,2.310522,96203.500000,0.071820,96202.30000,2.310572,...,0.0,0.0,0.0,0.952914,,,,0.0,,
1.748488e+09,96207.1,0.831635,96205.7,2.686969,96204.4,2.310522,96203.500000,0.071820,96202.30000,2.310572,...,0.0,0.0,0.0,1.098961,,,,0.0,,
1.748488e+09,96207.1,0.831635,96205.7,2.686969,96204.4,2.310522,96203.500000,0.071820,96202.30000,2.310572,...,0.0,0.0,0.0,1.098961,,,,0.0,,
1.748488e+09,96207.1,0.831635,96205.7,2.686969,96204.4,2.310522,96203.500000,0.071820,96202.30000,2.310572,...,0.0,0.0,0.0,0.841131,,,,0.0,,
1.748488e+09,96207.1,0.831635,96205.7,2.686969,96204.4,2.310522,96203.500000,0.071820,96202.30000,2.310572,...,0.0,0.0,0.0,1.128582,,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.748488e+09,96207.1,0.831635,96204.6,0.376426,96202.6,0.071820,96201.600000,2.310589,96199.20000,2.310648,...,0.0,0.0,0.0,1.146840,2.229034e-23,0.0,96207.15,0.0,0.831635,0.128075
1.748488e+09,96207.1,0.831635,96204.6,0.376426,96202.6,0.071820,96201.600000,2.310589,96199.20000,2.310648,...,0.0,0.0,0.0,1.146840,2.229034e-23,0.0,96207.15,0.0,0.831635,0.128075
1.748488e+09,96207.1,0.831635,96204.6,0.376426,96202.6,0.071820,96201.600000,2.310589,96199.20000,2.310648,...,0.0,0.0,0.0,1.146840,2.229034e-23,0.0,96207.15,0.0,0.831635,0.128075
1.748488e+09,96207.1,0.831635,96204.6,0.376426,96202.6,0.071820,96201.600000,2.310589,96199.20000,2.310648,...,0.0,0.0,0.0,0.833560,2.229034e-23,0.0,96207.15,0.0,0.831635,0.128075


In [5]:
features_ETH.head()

Unnamed: 0_level_0,level-1-bid-price,level-1-bid-volume,level-2-bid-price,level-2-bid-volume,level-3-bid-price,level-3-bid-volume,level-4-bid-price,level-4-bid-volume,level-5-bid-price,level-5-bid-volume,...,vwap-bid-5-levels,vwap-ask-5-levels,avg-vwap-diff-5-levels,liquidity-ratio,rate-inst-volatility,rate-momentum,rate-mid-price-trend,rate-vwap-diff-5-levels,rate-bid-volume-level-1,rate-ask-volume-level-1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1748488000.0,2453.02,4.5,2452.94,36.247509,2452.77,36.249957,2452.61,36.252261,2452.54,19.14,...,0.0,0.0,0.0,1.755425,,,,0.0,,
1748488000.0,2453.02,4.5,2452.94,36.247509,2452.77,36.249957,2452.61,36.252261,2452.54,19.14,...,0.0,0.0,0.0,3.109557,,,,0.0,,
1748488000.0,2453.02,4.5,2452.94,36.247509,2452.77,36.249957,2452.61,36.252261,2452.54,19.14,...,0.0,0.0,0.0,1.721685,,,,0.0,,
1748488000.0,2453.02,4.5,2452.99,36.246716,2452.94,36.247509,2452.77,36.249957,2452.61,36.252261,...,0.0,0.0,0.0,1.944153,,,,0.0,,
1748488000.0,2453.02,4.5,2452.99,36.246716,2452.94,36.247509,2452.89,1.924,2452.77,36.249957,...,0.0,0.0,0.0,1.497725,,,,0.0,,


In [6]:
tau_BTC = 10
tau_ETH = 10
tau_prediction = 5

price_ETH = features_ETH['level-1-bid-price']

In [7]:
price_ETH = price_ETH.reset_index()  # transforme l’index en colonne
price_ETH.columns = ['timestamp', 'price']  # renomme proprement
price_ETH

Unnamed: 0,timestamp,price
0,1.748488e+09,2453.02000
1,1.748488e+09,2453.02000
2,1.748488e+09,2453.02000
3,1.748488e+09,2453.02000
4,1.748488e+09,2453.02000
...,...,...
1817680,1.748523e+09,2376.28000
1817681,1.748523e+09,2376.19446
1817682,1.748523e+09,2375.88000
1817683,1.748523e+09,2375.89000


In [8]:
features_BTC.columns.to_list()[39:]

['level-10-ask-volume',
 'V-bid-5-levels',
 'V-ask-5-levels',
 'bid-ask-imbalance-5-levels',
 'spread',
 'slope-bid-5-levels',
 'slope-ask-5-levels',
 'vwap-bid-5-levels',
 'vwap-ask-5-levels',
 'avg-vwap-diff-5-levels',
 'liquidity-ratio',
 'rate-inst-volatility',
 'rate-momentum',
 'rate-mid-price-trend',
 'rate-vwap-diff-5-levels',
 'rate-bid-volume-level-1',
 'rate-ask-volume-level-1']

In [9]:
features_BTC = features_BTC.reset_index()
features_BTC = features_BTC.loc[:, ['timestamp',
 'V-bid-5-levels',
 'V-ask-5-levels',
 'bid-ask-imbalance-5-levels',
 'spread',
 'slope-bid-5-levels',
 'slope-ask-5-levels',
 'vwap-bid-5-levels',
 'vwap-ask-5-levels',
 'avg-vwap-diff-5-levels',
 'liquidity-ratio',
 'rate-inst-volatility',
 'rate-momentum',
 'rate-mid-price-trend',
 'rate-vwap-diff-5-levels',
 'rate-bid-volume-level-1',
 'rate-ask-volume-level-1']]
features_BTC

Unnamed: 0,timestamp,V-bid-5-levels,V-ask-5-levels,bid-ask-imbalance-5-levels,spread,slope-bid-5-levels,slope-ask-5-levels,vwap-bid-5-levels,vwap-ask-5-levels,avg-vwap-diff-5-levels,liquidity-ratio,rate-inst-volatility,rate-momentum,rate-mid-price-trend,rate-vwap-diff-5-levels,rate-bid-volume-level-1,rate-ask-volume-level-1
0,1.748488e+09,8.211519,8.617270,-0.024111,0.1,-0.584545,0.940570,0.0,0.0,0.0,0.952914,,,,0.0,,
1,1.748488e+09,8.211519,7.472074,0.047148,0.1,-0.584545,1.177719,0.0,0.0,0.0,1.098961,,,,0.0,,
2,1.748488e+09,8.211519,7.472074,0.047148,0.1,-0.584545,1.177719,0.0,0.0,0.0,1.098961,,,,0.0,,
3,1.748488e+09,8.211519,9.762472,-0.086289,0.1,-0.584545,0.829708,0.0,0.0,0.0,0.841131,,,,0.0,,
4,1.748488e+09,8.211519,7.275961,0.060407,0.1,-0.584545,0.980247,0.0,0.0,0.0,1.128582,,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563390,1.748523e+09,6.944440,9.743071,-0.167708,0.1,-1.368001,0.379757,0.0,0.0,0.0,0.712757,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792
563391,1.748523e+09,9.247813,8.007988,0.071850,0.1,-0.865069,0.495702,0.0,0.0,0.0,1.154823,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792
563392,1.748523e+09,8.096060,6.011780,0.147739,0.1,-0.988139,1.014675,0.0,0.0,0.0,1.346699,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792
563393,1.748523e+09,9.247813,8.269100,0.055872,0.1,-0.865069,0.556288,0.0,0.0,0.0,1.118358,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792


In [10]:
len(features_BTC), len(price_ETH)

(563395, 1817685)

In [11]:
print("BTC :", features_BTC["timestamp"].min(), "→", features_BTC["timestamp"].max())
print("ETH :", price_ETH   ["timestamp"].min(), "→", price_ETH   ["timestamp"].max())


BTC : 1748488448.889497 → 1748523002.355281
ETH : 1748488448.894009 → 1748523002.34935


In [None]:
features_BTC

Unnamed: 0,timestamp,V-bid-5-levels,V-ask-5-levels,bid-ask-imbalance-5-levels,spread,slope-bid-5-levels,slope-ask-5-levels,vwap-bid-5-levels,vwap-ask-5-levels,avg-vwap-diff-5-levels,liquidity-ratio,rate-inst-volatility,rate-momentum,rate-mid-price-trend,rate-vwap-diff-5-levels,rate-bid-volume-level-1,rate-ask-volume-level-1
0,1.748488e+09,8.211519,8.617270,-0.024111,0.1,-0.584545,0.940570,0.0,0.0,0.0,0.952914,,,,0.0,,
1,1.748488e+09,8.211519,7.472074,0.047148,0.1,-0.584545,1.177719,0.0,0.0,0.0,1.098961,,,,0.0,,
2,1.748488e+09,8.211519,7.472074,0.047148,0.1,-0.584545,1.177719,0.0,0.0,0.0,1.098961,,,,0.0,,
3,1.748488e+09,8.211519,9.762472,-0.086289,0.1,-0.584545,0.829708,0.0,0.0,0.0,0.841131,,,,0.0,,
4,1.748488e+09,8.211519,7.275961,0.060407,0.1,-0.584545,0.980247,0.0,0.0,0.0,1.128582,,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563390,1.748523e+09,6.944440,9.743071,-0.167708,0.1,-1.368001,0.379757,0.0,0.0,0.0,0.712757,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792
563391,1.748523e+09,9.247813,8.007988,0.071850,0.1,-0.865069,0.495702,0.0,0.0,0.0,1.154823,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792
563392,1.748523e+09,8.096060,6.011780,0.147739,0.1,-0.988139,1.014675,0.0,0.0,0.0,1.346699,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792
563393,1.748523e+09,9.247813,8.269100,0.055872,0.1,-0.865069,0.556288,0.0,0.0,0.0,1.118358,-1.118703e-07,0.000000e+00,95772.05,0.0,0.033861,0.529792


: 

In [None]:
X, y = create_dataset_vectorized_numeric(features_BTC, price_ETH, tau_BTC, tau_ETH, tau_prediction)

In [None]:
print(X.size())
print(y.size())

array([[ 8.21151910e+00,  8.61727011e+00, -2.41105290e-02, ...,
        -8.71444786e-12,  0.00000000e+00,  0.00000000e+00],
       [ 8.21151910e+00,  8.61727011e+00, -2.41105290e-02, ...,
         0.00000000e+00,  0.00000000e+00, -1.97211282e-11],
       [ 8.21151910e+00,  8.61727011e+00, -2.41105290e-02, ...,
         0.00000000e+00, -1.97211282e-11,  0.00000000e+00],
       ...,
       [ 5.90323430e+00,  2.37952983e+00,  4.25426152e-01, ...,
         0.00000000e+00, -8.92405106e+02,  1.04726692e+03],
       [ 5.90323430e+00,  2.37952983e+00,  4.25426152e-01, ...,
        -8.92405106e+02,  1.04726692e+03,  3.34660815e+00],
       [ 5.90323430e+00,  2.37952983e+00,  4.25426152e-01, ...,
         1.04726692e+03,  3.34660815e+00,  0.00000000e+00]],
      shape=(99723, 170))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# 2. Modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

mse = mean_squared_error(X, y)
r2 = r2_score(X, y)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")