In [22]:
import os
os.environ['PYTHONHASHSEED'] = '3888'
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import numpy as np
import pandas as pd
np.random.seed(3888)

import tensorflow as tf
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.random.set_seed(3888)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


In [23]:
def fill_missing_seconds(g):
    tid = g['time_id'].iloc[0]
    full = pd.DataFrame({'seconds_in_bucket': range(600)})
    full['time_id'] = tid
    return full.merge(g, on=['time_id', 'seconds_in_bucket'], how='left').sort_values('seconds_in_bucket').ffill().bfill()


In [24]:
def realized_vol(df_slice):
    mid = (df_slice['bid_price1'] + df_slice['ask_price1']) / 2
    lr = np.log(mid).diff().dropna()
    return lr.std()


In [25]:
def make_features(df, win=60):
    seq_list, stat_list, y_list = [], [], []
    for tid, grp in df.groupby('time_id'):
        grp = grp.sort_values('seconds_in_bucket')
        if len(grp) < 600: continue
        pre = grp[grp['seconds_in_bucket'] < 540]
        post = grp[(grp['seconds_in_bucket'] >= 540) & (grp['seconds_in_bucket'] < 600)]
        windows = []
        for i in range(0, 540, win):
            w = pre.iloc[i:i+win]
            vol = realized_vol(w)
            spr = (w['ask_price1'] - w['bid_price1']).mean()
            imb = ((w['bid_size1'] - w['ask_size1']) / (w['bid_size1'] + w['ask_size1'] + 1e-9)).mean()
            dep = w[['bid_size1','ask_size1','bid_size2','ask_size2']].sum(axis=1).mean()
            windows.append([vol, spr, imb, dep])
        seq_arr = np.array(windows)
        vols = seq_arr[:, 0]
        trend = np.diff(vols)
        trend2 = np.diff(trend)
        mid_series = (pre['bid_price1'] + pre['ask_price1']) / 2
        spread_series = (pre['ask_price1'] - pre['bid_price1'])
        depth_series = pre[['bid_size1','ask_size1','bid_size2','ask_size2']].sum(axis=1)
        static = np.concatenate([
            trend, trend2,
            [mid_series.mean(), mid_series.std()],
            [spread_series.mean(), spread_series.std()],
            [depth_series.mean(), depth_series.sum()],
            [np.log(mid_series.iloc[-1]+1e-9) - np.log(mid_series.iloc[-2]+1e-9)]
        ])
        seq_list.append(seq_arr)
        stat_list.append(static)
        y_list.append(realized_vol(post))
    return np.stack(seq_list), np.stack(stat_list), np.array(y_list)


In [26]:
def build_lstm(seq_len, d_model, static_dim):
    seq_in = Input(shape=(seq_len, d_model), name='seq_input')
    x = LSTM(64)(seq_in)
    stat_in = Input(shape=(static_dim,), name='static_input')
    merged = Concatenate()([x, stat_in])
    out = Dense(32, activation='relu')(merged)
    out = Dense(1, activation='linear')(out)
    return Model([seq_in, stat_in], out)


In [27]:
def main():
    stock_id = 'stock_20'
    data_dir = 'individual_book_train'
    test_size = 0.10
    epochs = 150
    batch_size = 32

    df = pd.read_csv(os.path.join(data_dir, f'{stock_id}.csv'))
    df = df.groupby('time_id', group_keys=False).apply(fill_missing_seconds)

    seq, stat, y_raw = make_features(df)
    y = np.log(y_raw + 1e-6) * 1e4
    y_scaler = StandardScaler()
    y = y_scaler.fit_transform(y.reshape(-1, 1)).flatten()

    N, L, D = seq.shape
    seq = StandardScaler().fit_transform(seq.reshape(-1, D)).reshape(N, L, D)
    stat = StandardScaler().fit_transform(stat)

    seq_tr, seq_te, stat_tr, stat_te, y_tr, y_te = train_test_split(
        seq, stat, y, test_size=test_size, random_state=3888
    )

    model = build_lstm(seq_len=L, d_model=D, static_dim=stat.shape[1])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    es = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)
    rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=6)

    model.fit(
        {'seq_input': seq_tr, 'static_input': stat_tr}, y_tr,
        validation_data=({'seq_input': seq_te, 'static_input': stat_te}, y_te),
        epochs=epochs, batch_size=batch_size,
        callbacks=[es, rlr], verbose=0
    )

    y_pred_std = model.predict({'seq_input': seq_te, 'static_input': stat_te}, verbose=0).flatten()
    y_pred_raw = np.exp(y_scaler.inverse_transform(y_pred_std.reshape(-1,1)).flatten() / 1e4) - 1e-6
    y_te_raw = np.exp(y_scaler.inverse_transform(y_te.reshape(-1,1)).flatten() / 1e4) - 1e-6

    r2 = r2_score(y_te_raw, y_pred_raw)
    mae_raw = np.mean(np.abs(y_pred_raw - y_te_raw))

    print(f"Test MAE: {mae_raw:.6f}")
    print(f"Test R²:  {r2:.5f}")

    for t, p in zip(y_te_raw[:5], y_pred_raw[:5]):
        print(f'True: {t:.6f} | Pred: {p:.6f}')


In [28]:
if __name__ == "__main__":
    main()


  df = df.groupby('time_id', group_keys=False).apply(fill_missing_seconds)


Test MAE: 0.000019
Test R²:  0.84352
True: 0.000041 | Pred: 0.000041
True: 0.000017 | Pred: 0.000030
True: 0.000046 | Pred: 0.000043
True: 0.000071 | Pred: 0.000059
True: 0.000067 | Pred: 0.000071
