In [3]:
# Cell 1
import os, gc, math, time
import numpy as np
import pandas as pd
from datetime import timezone
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
import tensorflow as tf

# Basic config - edit paths
RAW_CSV = "E:/DL Project/data/lob.csv"   # change to your original file path
CLEAN_CSV = "E:/DL Project/data/cleaned.csv"  # where cleaned file will be saved
os.makedirs(os.path.dirname(CLEAN_CSV), exist_ok=True)

# Parameters you may want to tune
WINDOW_SIZE = 300   # deepLOB default
HORIZON = 1         # label horizon (how many steps ahead to compare)
RET_THRESHOLD = 1e-5  # threshold for "no-move" vs up/down (relative)


In [7]:

import numpy as np
import pandas as pd

RAW_CSV = "E:/DL Project/data/lob.csv"  
df = pd.read_csv(RAW_CSV, low_memory=False)
print("Initial shape:", df.shape)
print("Columns:", df.columns.tolist()[:15], " ... total", len(df.columns))

# 1) drop the extra index column if present
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])
    print("Dropped 'Unnamed: 0'")

# 2) detect timestamp column: prefer readable col '1' if present
ts_col = None
if 1 in df.columns and df[1].dtype == object:
    ts_col = 1
else:
    # try to detect by parse-ability for object columns
    for c in df.columns:
        if df[c].dtype == object:
            try:
                _ = pd.to_datetime(df[c].iloc[0])
                ts_col = c
                break
            except:
                continue
    # fallback: numeric large-value column (unix ms)
    if ts_col is None:
        for c in df.columns:
            if np.issubdtype(df[c].dtype, np.number):
                val = float(df[c].iloc[0])
                if 1e12 < val < 2e13:   # likely milliseconds
                    ts_col = c
                    break
                if 1e9 < val < 2e10:    # likely seconds
                    ts_col = c
                    break

if ts_col is None:
    raise ValueError("Could not automatically detect timestamp column. Print df.columns and tell me the timestamp column name.")
print("Using timestamp column:", ts_col)

# 3) parse timestamps into datetime UTC
if df[ts_col].dtype == object:
    df['timestamp'] = pd.to_datetime(df[ts_col], utc=True, infer_datetime_format=True)
else:
    # numeric; decide ms vs s
    example = float(df[ts_col].iloc[0])
    if example > 1e12:
        df['timestamp'] = pd.to_datetime(df[ts_col], unit='ms', utc=True)
    else:
        df['timestamp'] = pd.to_datetime(df[ts_col], unit='s', utc=True)

# 4) drop rows with null timestamp and sort
df = df.dropna(subset=['timestamp']).sort_values('timestamp').reset_index(drop=True)
print("After parsing timestamp rows:", df.shape)
print(df['timestamp'].head())

# 5) identify numeric feature columns (exclude timestamp + label if any)
numeric_cols = [c for c in df.columns if c not in ('timestamp', ts_col) and np.issubdtype(df[c].dtype, np.number)]
print("Numeric columns count:", len(numeric_cols))

# 6) heuristically separate price-like vs size-like by magnitude (BTC price ~ thousands, sizes tend to be small)
col_stats = df[numeric_cols].agg(['mean','std','min','max']).transpose()
col_stats['mean_abs'] = col_stats['mean'].abs()
# price-like: mean_abs > 1000 (tweak if necessary). Size-like: mean_abs < 1000
price_cols = col_stats[col_stats['mean_abs'] > 1000].index.tolist()
size_cols  = col_stats[col_stats['mean_abs'] <= 1000].index.tolist()

print("Detected price-like cols (examples):", price_cols[:10])
print("Detected size-like cols (examples):", size_cols[:10])

# 7) compute robust mid_price: (min_price + max_price)/2 among price-like columns in each row
if len(price_cols) >= 2:
    # min and max across the detected price columns per row
    min_price = df[price_cols].min(axis=1)
    max_price = df[price_cols].max(axis=1)
    df['mid_price'] = (min_price + max_price) / 2.0
    print("Computed mid_price from price-like columns.")
else:
    # fallback: if not enough price columns detected, try a simple candidate: take first numeric col after timestamp
    fallback = [c for c in df.columns if c not in ('timestamp', ts_col)]
    fallback_numeric = [c for c in fallback if np.issubdtype(df[c].dtype, np.number)]
    if len(fallback_numeric) >= 2:
        df['mid_price'] = (df[fallback_numeric[0]] + df[fallback_numeric[1]]) / 2.0
        print("Fallback mid_price from first two numeric columns:", fallback_numeric[:2])
    else:
        raise ValueError("Cannot compute mid_price: not enough numeric columns detected.")

# 8) compute simple engineered features: log returns, spread if possible, imbalances if sizes present
df['mid_logret'] = np.log(df['mid_price']).diff().fillna(0)
df['mid_ret'] = df['mid_price'].pct_change().fillna(0)

if len(size_cols) >= 2:
    # create simple imbalance for first size pair (heuristic)
    df['imbalance_1'] = (df[size_cols[0]] - df[size_cols[1]]) / (df[size_cols[0]] + df[size_cols[1]] + 1e-9)
    print("Created imbalance using", size_cols[0], size_cols[1])

# 9) final housekeeping
df = df.reset_index(drop=True)
print("Final df shape:", df.shape)
display(df.head().iloc[:, :12])   # show first 12 columns for quick check

# Save or assign to CLEAN_CSV later in pipeline


Initial shape: (3730870, 43)
Columns: ['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']  ... total 43
Dropped 'Unnamed: 0'
Using timestamp column: 1


  df['timestamp'] = pd.to_datetime(df[ts_col], utc=True, infer_datetime_format=True)


After parsing timestamp rows: (3730870, 43)
0   2023-01-09 22:17:40+00:00
1   2023-01-09 22:17:41+00:00
2   2023-01-09 22:17:41+00:00
3   2023-01-09 22:17:41+00:00
4   2023-01-09 22:17:41+00:00
Name: timestamp, dtype: datetime64[ns, UTC]
Numeric columns count: 41
Detected price-like cols (examples): ['0', '2', '4', '6', '8', '10', '12', '14', '16', '18']
Detected size-like cols (examples): ['3', '5', '7', '9', '11', '13', '15', '17', '19', '21']
Computed mid_price from price-like columns.
Created imbalance using 3 5
Final df shape: (3730870, 47)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1673302660926,2023-01-09 22:17:40,17181.6,23.371,17181.5,0.746,17181.4,5.428,17181.2,0.89,17181.1,3.787
1,1673302661177,2023-01-09 22:17:41,17181.6,24.232,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.787
2,1673302661427,2023-01-09 22:17:41,17181.6,24.403,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.787
3,1673302661678,2023-01-09 22:17:41,17181.6,24.874,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.776
4,1673302661928,2023-01-09 22:17:41,17181.6,24.403,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.776


In [8]:
# Cell A: 3-class labels
HORIZON = 1            # steps ahead to compare
RET_THRESHOLD = 1e-5   # relative change threshold (tweak later)

def make_labels(df, horizon=HORIZON, threshold=RET_THRESHOLD):
    mp = df['mid_price'].values
    mp_future = np.roll(mp, -horizon)
    valid = np.arange(len(df) - horizon)
    ret = (mp_future - mp) / (mp + 1e-12)
    labels = np.full(len(df), -1, dtype=int)
    labels[valid] = np.where(ret[valid] > threshold, 2,
                             np.where(ret[valid] < -threshold, 0, 1))
    return labels

df['label_3'] = make_labels(df, horizon=HORIZON, threshold=RET_THRESHOLD)
print("Label counts (inc unlabeled -1):\n", pd.Series(df['label_3']).value_counts(dropna=False))
# drop tail rows that cannot be labeled
df = df[df['label_3'] != -1].reset_index(drop=True)
print("After dropping unlabeled tail:", df.shape)
print(pd.Series(df['label_3']).value_counts(normalize=True))


Label counts (inc unlabeled -1):
 label_3
 1    3730869
-1          1
Name: count, dtype: int64
After dropping unlabeled tail: (3730869, 48)
label_3
1    1.0
Name: proportion, dtype: float64


In [9]:
df['mid_ret'] = df['mid_price'].pct_change().fillna(0)

print("Return stats:")
print(df['mid_ret'].describe())

# how many rows with absolute return > 1e-5 ?
print("abs(ret) > 1e-5:", (df['mid_ret'].abs() > 1e-5).sum())

# try slightly bigger threshold
for th in [1e-5, 1e-4, 5e-4, 1e-3]:
    c_up = (df['mid_ret'] > th).sum()
    c_down = (df['mid_ret'] < -th).sum()
    print(f"TH={th}: up={c_up}, down={c_down}, neutral={len(df)-c_up-c_down}")


Return stats:
count    3.730869e+06
mean     1.498236e-10
std      2.485995e-10
min     -5.602555e-10
25%      1.493159e-10
50%      1.494114e-10
75%      1.502931e-10
max      5.585684e-08
Name: mid_ret, dtype: float64
abs(ret) > 1e-5: 0
TH=1e-05: up=0, down=0, neutral=3730869
TH=0.0001: up=0, down=0, neutral=3730869
TH=0.0005: up=0, down=0, neutral=3730869
TH=0.001: up=0, down=0, neutral=3730869


In [11]:
# Robust Cell 1 replacement: works whether df.columns are ints or strings
import numpy as np
import pandas as pd

# RAW_CSV already loaded earlier; but to be safe, reload a small head if df not present
# (comment out reload if df is already in memory)
# df = pd.read_csv(RAW_CSV, low_memory=False)

print("Columns sample:", df.columns.tolist()[:12])

# helper to get column name that may be int or str
def colname(x):
    """Return the actual column key in df for logical index x (0-based),
       handling column labels that are ints or their string equivalents."""
    # possible forms
    if x in df.columns:
        return x
    s = str(x)
    if s in df.columns:
        return s
    # sometimes pandas read as 'Unnamed: 0' removed earlier; try integer lookup by position
    try:
        # return label at position x (0-based)
        return df.columns[x]
    except Exception:
        raise KeyError(f"Could not find column for index {x}. Available cols: {df.columns[:12].tolist()}")

# map logical positions -> actual keys
unix_ts_col_key = colname(0)
human_ts_col_key = colname(1)

# build lists for bid/ask columns (logical positions) and map them to real keys
bid_price_positions = list(range(2, 22, 2))   # logical indices 2,4,...20
bid_size_positions  = list(range(3, 23, 2))   # 3,5,...21
ask_price_positions = list(range(22, 42, 2))  # 22,24,...40
ask_size_positions  = list(range(23, 43, 2))  # 23,25,...41

# convert positions to actual df column keys
bid_price_cols = [colname(p) for p in bid_price_positions]
bid_size_cols  = [colname(p) for p in bid_size_positions]
ask_price_cols = [colname(p) for p in ask_price_positions]
ask_size_cols  = [colname(p) for p in ask_size_positions]

print("Mapped columns (sample):")
print(" unix_ts_col_key:", unix_ts_col_key)
print(" human_ts_col_key:", human_ts_col_key)
print(" bid_price_cols:", bid_price_cols[:6])
print(" bid_size_cols :", bid_size_cols[:6])
print(" ask_price_cols:", ask_price_cols[:6])
print(" ask_size_cols :", ask_size_cols[:6])

# parse timestamp: dataset uses microseconds -> unit='us'
try:
    df['timestamp'] = pd.to_datetime(df[unix_ts_col_key], unit='us', utc=True)
except Exception as e:
    # fallback: if human-readable col exists, use that
    print("Failed parsing unix microseconds on column", unix_ts_col_key, "->", e)
    try:
        df['timestamp'] = pd.to_datetime(df[human_ts_col_key], utc=True, infer_datetime_format=True)
    except Exception as e2:
        raise RuntimeError("Could not parse timestamp from either column. Errors:\n1) {}\n2) {}".format(e, e2))

# sort by timestamp
df = df.sort_values('timestamp').reset_index(drop=True)

# ensure numeric types for price columns (coerce if necessary)
for c in bid_price_cols + ask_price_cols + bid_size_cols + ask_size_cols:
    # some columns might be read as strings due to commas etc. coerce to float
    df[c] = pd.to_numeric(df[c], errors='coerce')

# drop rows with NaNs in best bid/ask after coercion
df = df.dropna(subset=[bid_price_cols[0], ask_price_cols[0]]).reset_index(drop=True)

# compute mid-price from best bid & ask (level 1)
df['mid_price'] = (df[ask_price_cols[0]] + df[bid_price_cols[0]]) / 2.0
df['mid_ret'] = df['mid_price'].pct_change().fillna(0)

# quick checks
print("\nmid_price head:")
print(df['mid_price'].head(5))
print("\nmid_price describe:")
print(df['mid_price'].describe())
print("\nmid_ret describe:")
print(df['mid_ret'].describe())

# show first few columns for visual sanity
display(df.iloc[:5, :12])


Columns sample: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']
Mapped columns (sample):
 unix_ts_col_key: 0
 human_ts_col_key: 1
 bid_price_cols: ['2', '4', '6', '8', '10', '12']
 bid_size_cols : ['3', '5', '7', '9', '11', '13']
 ask_price_cols: ['22', '24', '26', '28', '30', '32']
 ask_size_cols : ['23', '25', '27', '29', '31', '33']

mid_price head:
0    17181.65
1    17181.65
2    17181.65
3    17181.65
4    17181.65
Name: mid_price, dtype: float64

mid_price describe:
count    3.730869e+06
mean     1.986779e+04
std      1.518495e+03
min      1.711835e+04
25%      1.822435e+04
50%      2.078375e+04
75%      2.104865e+04
max      2.165985e+04
Name: mid_price, dtype: float64

mid_ret describe:
count    3.730869e+06
mean     5.910676e-08
std      4.571530e-05
min     -6.898424e-03
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.243061e-02
Name: mid_ret, dtype: float64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1673302660926,2023-01-09 22:17:40,17181.6,23.371,17181.5,0.746,17181.4,5.428,17181.2,0.89,17181.1,3.787
1,1673302661177,2023-01-09 22:17:41,17181.6,24.232,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.787
2,1673302661427,2023-01-09 22:17:41,17181.6,24.403,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.787
3,1673302661678,2023-01-09 22:17:41,17181.6,24.874,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.776
4,1673302661928,2023-01-09 22:17:41,17181.6,24.403,17181.5,0.694,17181.4,5.428,17181.2,0.89,17181.1,3.776


In [12]:
# Cell 3
# recommended feature order: [bid_px1, bid_px2, ..., bid_px10, ask_px1, ask_px2, ..., ask_px10,
#                             bid_vol1, bid_vol2, ..., bid_vol10, ask_vol1, ... ask_vol10]
feature_cols = []

# append bid prices then ask prices (10 each)
feature_cols += bid_price_cols
feature_cols += ask_price_cols
# then bid sizes then ask sizes
feature_cols += bid_size_cols
feature_cols += ask_size_cols

# Keep only first 40 if you want NF=40 (but here there are exactly 40)
feature_cols = feature_cols[:40]
print("Using feature columns (count):", len(feature_cols))
print(feature_cols)


Using feature columns (count): 40
['2', '4', '6', '8', '10', '12', '14', '16', '18', '20', '22', '24', '26', '28', '30', '32', '34', '36', '38', '40', '3', '5', '7', '9', '11', '13', '15', '17', '19', '21', '23', '25', '27', '29', '31', '33', '35', '37', '39', '41']


In [16]:
# Cell 2 — Create labels
HORIZON = 20
RET_THRESHOLD = 1e-4  # 0.01%

def make_labels(df, horizon=HORIZON, threshold=RET_THRESHOLD):
    mp = df['mid_price'].values
    mp_future = np.roll(mp, -horizon)
    labels = np.full(len(df), -1, dtype=int)
    
    ret = (mp_future - mp) / (mp + 1e-12)
    valid = np.arange(len(df) - horizon)

    labels[valid] = np.where(
        ret[valid] > threshold, 2,
        np.where(ret[valid] < -threshold, 0, 1)
    )
    return labels

df['label_3'] = make_labels(df, HORIZON, RET_THRESHOLD)
print("Counts incl -1:", pd.Series(df['label_3']).value_counts())

# Drop unlabeled tail
df = df[df['label_3'] != -1].reset_index(drop=True)
print("Final label distribution:")
print(pd.Series(df['label_3']).value_counts(normalize=True))


Counts incl -1: label_3
 1    2395317
 2     670478
 0     665054
-1         20
Name: count, dtype: int64
Final label distribution:
label_3
1    0.642030
2    0.179712
0    0.178258
Name: proportion, dtype: float64


In [17]:
# Run this once to set params
WINDOW_SIZE = 300    # use 300 unless you want smaller for experiments
BATCH_SIZE = 64
SAMPLE_FOR_SCALER = 2000   # number of windows to sample from train for scaler
HORIZON = 20
RET_THRESHOLD = 1e-4
EPOCHS = 30


In [18]:
# Time-based split using quantiles (no leakage)
train_q = 0.70
val_q = 0.85

train_cut = df['timestamp'].quantile(train_q)
val_cut   = df['timestamp'].quantile(val_q)

df_train = df[df['timestamp'] <= train_cut].reset_index(drop=True)
df_val   = df[(df['timestamp'] > train_cut) & (df['timestamp'] <= val_cut)].reset_index(drop=True)
df_test  = df[df['timestamp'] > val_cut].reset_index(drop=True)

print("Rows (train/val/test):", len(df_train), len(df_val), len(df_test))
# quick label checks (after window offset)
print("Train label counts (head):", pd.Series(df_train['label_3']).value_counts(normalize=True))
print("Val label counts (head):", pd.Series(df_val['label_3']).value_counts(normalize=True))


Rows (train/val/test): 2611594 559627 559628
Train label counts (head): label_3
1    0.640456
2    0.180592
0    0.178952
Name: proportion, dtype: float64
Val label counts (head): label_3
1    0.573298
0    0.214729
2    0.211973
Name: proportion, dtype: float64


In [19]:
from sklearn.preprocessing import StandardScaler
import numpy as np

FEATURE_COLS = bid_price_cols + bid_size_cols + ask_price_cols + ask_size_cols
FEATURE_COLS = FEATURE_COLS[:40]   # ensure NF=40

def df_windows_generator_sample(df_slice, feature_cols, window_size=WINDOW_SIZE):
    n = len(df_slice)
    for end in range(window_size, n):
        start = end - window_size
        X = df_slice.iloc[start:end][feature_cols].values.astype(np.float32)
        yield X

# sample windows
sample_windows = []
gen = df_windows_generator_sample(df_train, FEATURE_COLS, window_size=WINDOW_SIZE)
for i, x in enumerate(gen):
    sample_windows.append(x)
    if i >= SAMPLE_FOR_SCALER - 1:
        break

sample_windows = np.stack(sample_windows)  # shape: (SAMPLE_FOR_SCALER, W, F)
print("sample_windows shape:", sample_windows.shape)

scaler = StandardScaler()
scaler.fit(sample_windows.reshape(-1, sample_windows.shape[-1]))   # fit on time-steps across windows

# save scaler
import joblib
joblib.dump(scaler, "scaler_deeplob.save")
print("Scaler fitted; mean shape:", scaler.mean_.shape)


sample_windows shape: (2000, 300, 40)
Scaler fitted; mean shape: (40,)


In [20]:
import tensorflow as tf

SHUFFLE_BUFFER = 8192
NF = len(FEATURE_COLS)  # 40

# dataset of indices (end positions)
def make_index_ds(df_slice, window_size=WINDOW_SIZE):
    n = len(df_slice)
    if n <= window_size:
        return None
    indices = np.arange(window_size, n, dtype=np.int32)
    return tf.data.Dataset.from_tensor_slices(indices)

# fetch window in python (used by tf.numpy_function)
def fetch_window_by_index(df_slice, feature_cols, idx):
    i = int(idx)
    start = i - WINDOW_SIZE
    win = df_slice.iloc[start:i][feature_cols].values.astype(np.float32)
    lab = int(df_slice.iloc[i]['label_3'])
    return win, np.int32(lab)

# map index -> (x,y)
def map_index_to_window(df_slice):
    def _map(idx):
        x, y = tf.numpy_function(
            func=lambda k: fetch_window_by_index(df_slice, FEATURE_COLS, k),
            inp=[idx],
            Tout=[tf.float32, tf.int32]
        )
        x.set_shape((WINDOW_SIZE, NF))
        y.set_shape(())
        return x, y
    return _map

def make_tf_dataset_from_df_safe(df_slice, shuffle=False):
    ds_idx = make_index_ds(df_slice, WINDOW_SIZE)
    if ds_idx is None:
        raise ValueError("df_slice too small for window_size")
    if shuffle:
        ds_idx = ds_idx.shuffle(SHUFFLE_BUFFER)
    ds = ds_idx.map(map_index_to_window(df_slice), num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return ds

# build raw ds
raw_train_ds = make_tf_dataset_from_df_safe(df_train, shuffle=True)
raw_val_ds   = make_tf_dataset_from_df_safe(df_val, shuffle=False)
raw_test_ds  = make_tf_dataset_from_df_safe(df_test, shuffle=False)

# prepare scaler constants for TF
scaler_mean = tf.constant(scaler.mean_, dtype=tf.float32)   # shape (NF,)
scaler_scale = tf.constant(scaler.scale_, dtype=tf.float32)

# compute class weights (from training labels actually used in windows)
train_labels_for_windows = df_train['label_3'].values[WINDOW_SIZE:]   # labels corresponding to windows
from sklearn.utils.class_weight import compute_class_weight
cw = compute_class_weight('balanced', classes=np.unique(train_labels_for_windows), y=train_labels_for_windows)
# cw maps to classes sorted order — build vector weight_by_label where index=label
# labels are 0,1,2
weight_by_label = np.ones(3, dtype=np.float32)
unique_classes = np.unique(train_labels_for_windows)
for i, lab in enumerate(unique_classes):
    weight_by_label[int(lab)] = float(cw[i])
print("Class weights vector:", weight_by_label)
weight_by_label_tf = tf.constant(weight_by_label, dtype=tf.float32)

# mapping functions: scale, add channel, one-hot labels, sample-weight
def scale_and_prepare(x, y):
    # x shape: (batch, W, NF)
    x = (x - scaler_mean) / scaler_scale   # broadcasts over W
    x = tf.expand_dims(x, axis=-1)         # (batch, W, NF, 1)
    y_onehot = tf.one_hot(y, depth=3)
    sample_w = tf.gather(weight_by_label_tf, y)  # yields shape (batch,)
    return x, y_onehot, sample_w

# build final datasets
train_ds_model = raw_train_ds.map(lambda x,y: scale_and_prepare(x,y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds_model   = raw_val_ds.map(lambda x,y: scale_and_prepare(x,y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds_model  = raw_test_ds.map(lambda x,y: scale_and_prepare(x,y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

# quick sanity check: inspect one batch
for xb, yb, wb in train_ds_model.take(1):
    print("x batch shape:", xb.shape, " y shape:", yb.shape, " w shape:", wb.shape)


Class weights vector: [1.8624867 0.5204952 1.845577 ]
x batch shape: (64, 300, 40, 1)  y shape: (64, 3)  w shape: (64,)


In [22]:
# Fixed DeepLOB model: correct reshape (flatten width * channels per time step)
from tensorflow.keras import layers, regularizers, Model, Input
import tensorflow as tf

def focal_loss(gamma=2., alpha=.25):
    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = 1e-8
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        cross = - y_true * tf.math.log(y_pred)
        weight = alpha * tf.pow(1 - y_pred, gamma)
        return tf.reduce_mean(weight * cross)
    return loss

def create_deeplob_reg(T=WINDOW_SIZE, NF=NF, number_of_lstm=64, l2=1e-4, dropout_rate=0.3):
    """
    Corrected reshape: after conv block tensor shape is (batch, T, W, C).
    We want a sequence of length T where each time-step has features W*C.
    """
    input_lmd = Input(shape=(T, NF, 1))
    kreg = regularizers.l2(l2)

    # initial conv stack
    x = layers.Conv2D(32, (1,2), strides=(1,2), kernel_regularizer=kreg)(input_lmd)
    x = layers.BatchNormalization()(x); x = layers.LeakyReLU(0.01)(x)

    x = layers.Conv2D(32, (4,1), padding='same', kernel_regularizer=kreg)(x)
    x = layers.BatchNormalization()(x); x = layers.LeakyReLU(0.01)(x)

    x = layers.Conv2D(32, (4,1), padding='same', kernel_regularizer=kreg)(x)
    x = layers.BatchNormalization()(x); x = layers.LeakyReLU(0.01)(x)

    x = layers.Conv2D(32, (1,2), strides=(1,2), kernel_regularizer=kreg)(x)
    x = layers.BatchNormalization()(x); x = layers.LeakyReLU(0.01)(x)

    # inception-like block
    b1 = layers.Conv2D(64, (1,1), padding='same', kernel_regularizer=kreg)(x)
    b1 = layers.BatchNormalization()(b1); b1 = layers.LeakyReLU(0.01)(b1)
    b1 = layers.Conv2D(64, (3,1), padding='same', kernel_regularizer=kreg)(b1)
    b1 = layers.BatchNormalization()(b1); b1 = layers.LeakyReLU(0.01)(b1)

    b2 = layers.Conv2D(64, (1,1), padding='same', kernel_regularizer=kreg)(x)
    b2 = layers.BatchNormalization()(b2); b2 = layers.LeakyReLU(0.01)(b2)
    b2 = layers.Conv2D(64, (5,1), padding='same', kernel_regularizer=kreg)(b2)
    b2 = layers.BatchNormalization()(b2); b2 = layers.LeakyReLU(0.01)(b2)

    b3 = layers.MaxPooling2D((3,1), strides=(1,1), padding='same')(x)
    b3 = layers.Conv2D(64, (1,1), padding='same', kernel_regularizer=kreg)(b3)
    b3 = layers.BatchNormalization()(b3); b3 = layers.LeakyReLU(0.01)(b3)

    x = layers.concatenate([b1, b2, b3], axis=3)  # shape: (batch, T, W, C)

    # correct reshape: flatten width * channels per time-step
    # x.shape -> (batch, seq_len=T, width=W, channels=C)
    seq_len = int(x.shape[1])   # T
    width   = int(x.shape[2])   # W
    channels = int(x.shape[3])  # C
    feat_dim = width * channels
    x = layers.Reshape((seq_len, feat_dim))(x)    # -> (batch, T, W*C)

    # recurrent block
    x = layers.Dropout(dropout_rate)(x)
    x = layers.LSTM(number_of_lstm, kernel_regularizer=kreg, recurrent_regularizer=kreg)(x)
    x = layers.Dropout(dropout_rate)(x)

    out = layers.Dense(3, activation='softmax', kernel_regularizer=kreg)(x)
    return Model(inputs=input_lmd, outputs=out)

# Recreate and compile model
model = create_deeplob_reg(T=WINDOW_SIZE, NF=NF, number_of_lstm=64)
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=opt, loss=focal_loss(gamma=2., alpha=.25), metrics=['accuracy'])
model.summary()


In [25]:
# QUICK RUN FOR PROJECT REVIEW - FAST AND SHOWS IMPROVEMENT
FAST_STEPS = 1500        # trains on 1500 batches only (fast)
VAL_STEPS = 300          # evaluate on 300 val batches


callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        "best_deeplob_quick.keras",
        save_best_only=True,
        monitor='val_loss',
        mode='min',
        verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-7,
        verbose=1
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=6,
        restore_best_weights=True,
        verbose=1
    ),
    tf.keras.callbacks.TensorBoard(
        log_dir='./logs/deeplob_quick',
        histogram_freq=1
    )
]


history = model.fit(
    train_ds_model,
    validation_data=val_ds_model,
    epochs=10,
    steps_per_epoch=FAST_STEPS,
    validation_steps=VAL_STEPS,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/10


[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688ms/step - accuracy: 0.8903 - loss: 0.0709
Epoch 1: val_loss improved from None to 0.09220, saving model to best_deeplob_quick.keras
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1082s[0m 721ms/step - accuracy: 0.9007 - loss: 0.0566 - val_accuracy: 0.5423 - val_loss: 0.0922 - learning_rate: 1.0000e-04
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 704ms/step - accuracy: 0.9563 - loss: 0.0280
Epoch 2: val_loss improved from 0.09220 to 0.06812, saving model to best_deeplob_quick.keras
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1106s[0m 737ms/step - accuracy: 0.9471 - loss: 0.0246 - val_accuracy: 0.5423 - val_loss: 0.0681 - learning_rate: 1.0000e-04
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700ms/step - accuracy: 0.8578 - loss: 0.0236
Epoch 3: val_loss improved from 0.06812 to 0.05419, saving model to best_deeplob_qui

KeyboardInterrupt: 