In [3]:
# ============================================
# WaveNet-CNN for Bitcoin LOB — FULL PIPELINE
# ============================================

# --- Imports & config ---
import os, re, glob, math, datetime, random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Reproducibility (as much as TF allows)
SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

# --- Paths (edit if needed) ---
BASE_DIR = r"E:\DL Project"         # your project root
DATA_CSV = r"E:\DL Project\data\lob.csv"
ARTIFACTS_DIR = os.path.join(BASE_DIR, "artifacts_wavenet")
LOGS_DIR      = os.path.join(BASE_DIR, "logs", "wavenet")
os.makedirs(ARTIFACTS_DIR, exist_ok=True); os.makedirs(LOGS_DIR, exist_ok=True)

# --- Data / Model hyperparams ---
T = 300                 # timesteps per sample
F = 40                  # features per timestep (we select 40 LOB columns below)
NUM_CLASSES = 3
BATCH_SIZE = 20
EPOCHS = 20
LR = 1e-4

# ================
# 1) Load & label
# ================
print("Loading CSV...")
df = pd.read_csv(DATA_CSV, index_col='Unnamed: 0', parse_dates=True)
df.columns = np.arange(42)
df = df.drop_duplicates(subset=1)      # drop duplicate timestamps

# Select 40 LOB features (10 levels bid/ask {price, size})
feature_cols = [2,3,22,23,4,5,24,25,6,7,26,27,8,9,28,29,10,11,30,31,
                12,13,32,33,14,15,34,35,16,17,36,37,18,19,38,39,20,21,40,41]
data = df.loc[:, feature_cols].copy()
data.index = pd.to_datetime(df[1])     # use exchange timestamp

# Midprice
midprice = pd.DataFrame((df.iloc[:,2] + df.iloc[:,22]) / 2, columns=['Price'])
midprice.index = pd.to_datetime(df[1])

# DeepLOB-style labeling
def make_labels(mp: pd.DataFrame, k=10, alpha=1e-5, mode=1):
    x = mp.copy()
    x["MeanNeg"] = x['Price'].rolling(window=k).mean()
    x["MeanPos"] = x["MeanNeg"].shift(-(k-1))
    if mode == 1:
        x["s"] = (x["MeanPos"] - x['Price']) / x['Price']
    else:
        x["s"] = (x["MeanPos"] - x["MeanNeg"]) / x["MeanNeg"]
    x[k] = 0
    x.loc[x["s"] < -alpha, k] = -1
    x.loc[x["s"] >  alpha, k] =  1
    x = x.dropna()
    return x

label_df = make_labels(midprice, k=10, alpha=1e-5, mode=1)
# Align features to labeled index
data = data.loc[label_df.index].copy()
y_raw = label_df.loc[data.index, 10]   # values in {-1,0,1}

# ===============================
# 2) Split by time (no leakage)
# ===============================
n = len(data)
i_train = int(0.60*n)
i_val   = int(0.75*n)

data_train = data.iloc[:i_train].copy()
data_val   = data.iloc[i_train:i_val].copy()
data_test  = data.iloc[i_val:].copy()

y_train_raw = y_raw.iloc[:i_train]
y_val_raw   = y_raw.iloc[i_train:i_val]
y_test_raw  = y_raw.iloc[i_val:]

print("Sizes -> train/val/test:", len(data_train), len(data_val), len(data_test))

# ==========================================
# 3) Scale using TRAIN stats (no leakage)
# ==========================================
scaler = StandardScaler()
data_train_np = scaler.fit_transform(data_train.values)
data_val_np   = scaler.transform(data_val.values)
data_test_np  = scaler.transform(data_test.values)

data_train = pd.DataFrame(data_train_np, index=data_train.index, columns=data_train.columns)
data_val   = pd.DataFrame(data_val_np,   index=data_val.index,   columns=data_val.columns)
data_test  = pd.DataFrame(data_test_np,  index=data_test.index,  columns=data_test.columns)

# ================================
# 4) Labels -> one-hot (0/1/2)
# ================================
map_dict = {-1:0, 0:1, 1:2}
y_train_idx = y_train_raw.map(map_dict).astype(int).values
y_val_idx   = y_val_raw.map(map_dict).astype(int).values
y_test_idx  = y_test_raw.map(map_dict).astype(int).values

y_train = keras.utils.to_categorical(y_train_idx, num_classes=NUM_CLASSES)
y_val   = keras.utils.to_categorical(y_val_idx,   num_classes=NUM_CLASSES)
y_test  = keras.utils.to_categorical(y_test_idx,  num_classes=NUM_CLASSES)

# Class weights for imbalance
classes = np.array([0,1,2])
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_idx)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
print("Class weights:", class_weight)

# ===================================
# 5) DataLoader (Sequence generator)
# ===================================
class LOBSequence(keras.utils.Sequence):
    """
    Returns (X, y) with X: (batch, T, F) and y: (batch, 3).
    Iterates sequentially; no shuffling to preserve time order.
    """
    def __init__(self, X_df, Y_oh, window_size, num_features, batch_size):
        self.X = X_df.reset_index(drop=True)   # pandas -> contiguous rows
        self.Y = Y_oh
        self.T = window_size
        self.F = num_features
        self.B = batch_size

    def __len__(self):
        return max(0, math.floor((len(self.X) - self.T) / self.B))

    def __getitem__(self, idx):
        dataX, dataY = [], []
        i = idx + self.T
        for _ in range(self.B):
            x_win = self.X.iloc[i-self.T:i].values         # (T,F)
            y_one = self.Y[i]                               # (3,)
            dataX.append(x_win)
            dataY.append(y_one)
            i += 1
        return np.asarray(dataX, np.float32), np.asarray(dataY, np.float32)

TrainBatch      = LOBSequence(data_train, y_train, T, F, BATCH_SIZE)
ValidationBatch = LOBSequence(data_val,   y_val,   T, F, BATCH_SIZE)
TestBatch       = LOBSequence(data_test,  y_test,  T, F, BATCH_SIZE)

# ===========================
# 6) WaveNet-CNN architecture
# ===========================
def build_wavenet(timesteps=300, features=40, filters=64, dilation_stack=(1,2,4,8,16,32)):
    inp = L.Input(shape=(timesteps, features))
    x = inp
    for d in dilation_stack:
        res = x
        x = L.Conv1D(filters, 3, padding="causal", dilation_rate=d, activation="relu")(x)
        x = L.Conv1D(filters, 3, padding="causal", dilation_rate=d, activation="relu")(x)
        if res.shape[-1] != x.shape[-1]:
            res = L.Conv1D(filters, 1, padding="same")(res)
        x = L.Add()([x, res])         # residual block
        x = L.BatchNormalization()(x)  # stabilize
    x = L.GlobalAveragePooling1D()(x)
    out = L.Dense(NUM_CLASSES, activation="softmax")(x)
    model = keras.Model(inp, out, name="WaveNetCNN")
    model.compile(optimizer=keras.optimizers.Adam(LR),
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

model = build_wavenet(T, F, filters=64, dilation_stack=(1,2,4,8,16,32))
model.summary()

# ===============================
# 7) Callbacks (checkpoints/logs)
# ===============================
stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
CKPT_DIR = os.path.join(ARTIFACTS_DIR, "wavenet")
os.makedirs(CKPT_DIR, exist_ok=True)

BEST_MODEL_PATH  = os.path.join(CKPT_DIR, "best_model.keras")
EPOCH_MODEL_PATH = os.path.join(CKPT_DIR, "epoch_{epoch:02d}-val_{val_loss:.4f}.keras")

LOG_RUN_DIR = os.path.join(LOGS_DIR, stamp)
os.makedirs(LOG_RUN_DIR, exist_ok=True)
CSV_PATH = os.path.join(LOG_RUN_DIR, "metrics.csv")

ckpt_best = keras.callbacks.ModelCheckpoint(
    filepath=BEST_MODEL_PATH,
    save_weights_only=False, monitor="val_loss", mode="min",
    save_best_only=True, verbose=1
)
ckpt_epoch = keras.callbacks.ModelCheckpoint(
    filepath=EPOCH_MODEL_PATH,
    save_weights_only=False, monitor="val_loss", mode="min",
    save_best_only=False, verbose=0
)
early = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True, verbose=1
)
reduce = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1
)
tb = keras.callbacks.TensorBoard(log_dir=LOG_RUN_DIR, histogram_freq=1)
csv = keras.callbacks.CSVLogger(CSV_PATH, append=True)

callbacks = [ckpt_best, ckpt_epoch, early, reduce, tb, csv]

# ==================
# 8) Train
# ==================
history = model.fit(
    TrainBatch,
    validation_data=ValidationBatch,
    epochs=10,
    callbacks=callbacks,
    class_weight=class_weight,     # handles class imbalance
    verbose=1
)

# ==================
# 9) Evaluate
# ==================
print("\nEvaluating on Test set...")
test_loss, test_acc = model.evaluate(TestBatch, verbose=1)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

# Detailed metrics
def collect_xy(seq):
    ys = []
    for i in range(len(seq)):
        _, y = seq[i]
        ys.append(y)
    return np.concatenate(ys, axis=0)

y_true = collect_xy(TestBatch)                     # (N,3)
y_true_cls = y_true.argmax(axis=1)

y_pred = model.predict(TestBatch, verbose=0)       # (N,3)
y_pred_cls = y_pred.argmax(axis=1)

print("Balanced Acc:", balanced_accuracy_score(y_true_cls, y_pred_cls))
print("Macro F1    :", f1_score(y_true_cls, y_pred_cls, average='macro'))
print(classification_report(y_true_cls, y_pred_cls, digits=4))

cm = confusion_matrix(y_true_cls, y_pred_cls, labels=[0,1,2])
ConfusionMatrixDisplay(cm, display_labels=["Down","Neutral","Up"]).plot(values_format='d')
plt.title("Confusion Matrix — Test"); plt.show()

# ==========================
# 10) Plot curves (from CSV)
# ==========================
def plot_from_csv(csv_path, title_prefix="WaveNet"):
    if not os.path.exists(csv_path):
        print("CSV not found:", csv_path); return
    dfm = pd.read_csv(csv_path)
    if 'epoch' not in dfm.columns:
        dfm.insert(0, 'epoch', np.arange(len(dfm)))
    # Accuracy
    plt.figure(figsize=(8,5))
    if 'accuracy' in dfm.columns: plt.plot(dfm['epoch'], dfm['accuracy'], label='Train Acc')
    if 'val_accuracy' in dfm.columns: plt.plot(dfm['epoch'], dfm['val_accuracy'], label='Val Acc')
    plt.title(f'{title_prefix} Accuracy per Epoch'); plt.xlabel('Epoch'); plt.ylabel('Accuracy')
    plt.legend(); plt.grid(True); plt.show()
    # Loss
    plt.figure(figsize=(8,5))
    if 'loss' in dfm.columns: plt.plot(dfm['epoch'], dfm['loss'], label='Train Loss')
    if 'val_loss' in dfm.columns: plt.plot(dfm['epoch'], dfm['val_loss'], label='Val Loss')
    plt.title(f'{title_prefix} Loss per Epoch'); plt.xlabel('Epoch'); plt.ylabel('Loss')
    plt.legend(); plt.grid(True); plt.show()

plot_from_csv(CSV_PATH, "WaveNet")

# ==========================
# 11) Resume later (example)
# ==========================
# To resume training in a new session:
# model = keras.models.load_model(BEST_MODEL_PATH)
# model.fit(TrainBatch, validation_data=ValidationBatch, epochs=EPOCHS, callbacks=callbacks,
#           class_weight=class_weight, verbose=1)


Loading CSV...


  df = pd.read_csv(DATA_CSV, index_col='Unnamed: 0', parse_dates=True)


Sizes -> train/val/test: 561108 140277 233795
Class weights: {0: 0.9606467452156673, 1: 1.1049500505107195, 2: 0.9487518958704264}


Epoch 1/10


  self._warn_if_super_not_called()


[1m28039/28040[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 38ms/step - accuracy: 0.6373 - loss: 0.8851
Epoch 1: val_loss improved from None to 125.94903, saving model to E:\DL Project\artifacts_wavenet\wavenet\best_model.keras
[1m28040/28040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1166s[0m 41ms/step - accuracy: 0.6724 - loss: 0.7965 - val_accuracy: 0.4051 - val_loss: 125.9490 - learning_rate: 1.0000e-04
Epoch 2/10
[1m28040/28040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.8135 - loss: 0.5267
Epoch 2: val_loss did not improve from 125.94903
[1m28040/28040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1096s[0m 39ms/step - accuracy: 0.8408 - loss: 0.4698 - val_accuracy: 0.4051 - val_loss: 276.5611 - learning_rate: 1.0000e-04
Epoch 3/10
[1m28040/28040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.8869 - loss: 0.3646
Epoch 3: val_loss did not improve from 125.94903

Epoch 3: ReduceLROnPlateau reducing

KeyboardInterrupt: 