In [1]:
import sys
sys.path.append("..")

In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

from utils.helper import *

In [3]:
# Read the BTC price/indicator table; the 'Date' column is parsed to datetimes on load.
df = pd.read_csv(
    "../data/Final_Btc_Data_with_Indicators.csv",
    parse_dates=["Date"],
)

# Regression target, and the model inputs: every column that is neither the
# target nor the timestamp (column order of the CSV is preserved).
target_col = "Price"
features = [name for name in df.columns if name not in (target_col, "Date")]

# NOTE(review): `scaled_data` is an external helper (presumably from utils.helper
# via the star import) and is applied to the whole frame before any split —
# confirm it does not leak validation/test statistics into training.
df = scaled_data(df)

# Sliding-window setup: `seq_len` and `horizon` are forwarded to `create_sequences`
# together with the feature matrix and the target vector.
seq_len = 15
horizon = 1
X, y = create_sequences(
    df[features].values,
    df[target_col].values,
    seq_len,
    horizon,
)

In [4]:
# Mini-batch size forwarded to the loader factory below.
batch_size = 256

# Build the train/validation/test loaders from the scaled frame and the windowed
# arrays. How `data_loaders` splits the data is defined outside this notebook
# (presumably in utils.helper — confirm).
train_loader, val_loader, test_loader = data_loaders(df, X, y, batch_size=batch_size)

## CNN-BiLSTM-Attention

In [5]:
from __future__ import annotations
import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


# -----------------------------
# Blocks
# -----------------------------
class ConvFeatureExtractor(nn.Module):
    """Stack of ``Conv1d -> (norm) -> GELU -> Dropout`` stages over the time axis.

    Args:
        in_ch: Number of input features per time step.
        channels: Output channel count of each stage.
        kernels: Kernel size of each stage; padding is ``kernel // 2``, which
            keeps the sequence length for odd kernels at stride 1.
        strides: Stride of each stage.
        dropout: Dropout probability applied at the end of every stage.
        norm: ``"batch"`` for ``BatchNorm1d``, ``"layer"`` for a single-group
            ``GroupNorm``; any other value (including ``None``) adds no norm.

    ``channels``, ``kernels`` and ``strides`` must have the same length.
    """

    def __init__(
        self,
        in_ch: int,
        channels: Tuple[int, ...] = (64, 128),
        kernels: Tuple[int, ...] = (5, 3),
        strides: Tuple[int, ...] = (1, 1),
        dropout: float = 0.1,
        norm: Optional[str] = "batch",
    ):
        super().__init__()
        assert len(channels) == len(kernels) == len(strides)

        stages = []
        prev_width = in_ch
        for width, kernel, stride in zip(channels, kernels, strides):
            parts = [
                nn.Conv1d(
                    prev_width,
                    width,
                    kernel_size=kernel,
                    stride=stride,
                    padding=kernel // 2,  # keep length ~ same
                )
            ]
            normaliser = self._make_norm(norm, width)
            if normaliser is not None:
                parts.append(normaliser)
            parts.extend((nn.GELU(), nn.Dropout(dropout)))
            stages.append(nn.Sequential(*parts))
            prev_width = width

        self.net = nn.Sequential(*stages)

    @staticmethod
    def _make_norm(kind: Optional[str], width: int) -> Optional[nn.Module]:
        """Return the normalisation layer selected by ``kind``, or ``None``."""
        if kind == "batch":
            return nn.BatchNorm1d(width)
        if kind == "layer":
            return nn.GroupNorm(1, width)
        return None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``(B, T, F)`` input to ``(B, T', C)`` convolutional features."""
        # Conv1d wants channels first, so swap to (B, F, T), convolve, swap back.
        channels_first = x.transpose(1, 2)
        return self.net(channels_first).transpose(1, 2)

In [6]:
class AdditiveAttention(nn.Module):
    """Additive (tanh) attention pooling over the time axis.

    Scores each time step with ``v^T tanh(W h_t)``, normalises the scores with a
    softmax over time and returns the weighted sum of the hidden states.

    Args:
        hidden_dim: Size ``D`` of each hidden state.
        attn_dim: Size of the intermediate projection used for scoring.
        dropout: Dropout probability applied to ``H`` before scoring only; the
            context vector is built from the undropped ``H``.
    """

    def __init__(self, hidden_dim: int, attn_dim: int = 128, dropout: float = 0.1):
        super().__init__()
        self.W = nn.Linear(hidden_dim, attn_dim, bias=True)
        self.v = nn.Linear(attn_dim, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, H: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Pool ``H`` into one context vector per sequence.

        Args:
            H: Hidden states of shape ``(B, T, D)``.
            mask: Optional ``(B, T)`` tensor; truthy entries mark valid steps.

        Returns:
            ``(context, alpha)`` with shapes ``(B, D)`` and ``(B, T)``. For a row
            whose mask has no valid step, ``alpha`` is all zeros and so is the
            context (instead of NaN).
        """
        # H: (B, T, D)
        score = self.v(torch.tanh(self.W(self.dropout(H))))  # (B, T, 1)
        score = score.squeeze(-1)  # (B, T)

        keep = None
        if mask is not None:
            keep = mask.to(device=score.device, dtype=torch.bool)
            score = score.masked_fill(~keep, float('-inf'))

        alpha = torch.softmax(score, dim=1)  # (B, T)

        if keep is not None:
            # A fully masked row is softmax over all -inf, i.e. NaN everywhere.
            # Masked positions carry zero weight by definition, so force them to
            # zero; partially masked rows are unchanged by this.
            alpha = alpha.masked_fill(~keep, 0.0)

        context = torch.bmm(alpha.unsqueeze(1), H).squeeze(1)  # (B, D)
        return context, alpha

In [7]:
class CNNBiLSTMAttention(nn.Module):
    """Sequence model: Conv1d feature extractor -> BiLSTM -> attention -> MLP head.

    Args:
        n_features: Number of input features per time step.
        cnn_channels, cnn_kernels, cnn_strides, cnn_norm: Forwarded to
            ``ConvFeatureExtractor``; the last entry of ``cnn_channels`` is the
            LSTM input size.
        lstm_hidden: Hidden size per LSTM direction (the BiLSTM emits twice this).
        lstm_layers: Number of stacked LSTM layers.
        lstm_dropout: Dropout between LSTM layers; only used when
            ``lstm_layers > 1``.
        attn_dim: Projection size inside ``AdditiveAttention``.
        head_hidden: Width of the hidden layer in the output head.
        output_dim: Size of the head output (number of classes for
            classification).
        task: ``"classification"`` returns the head output unchanged; any other
            value is handled as regression and squeezes the last dimension.
        dropout: Dropout used by the CNN, the attention scorer and the head.
    """

    def __init__(
        self,
        n_features: int,
        cnn_channels: Tuple[int, ...] = (64, 128),
        cnn_kernels: Tuple[int, ...] = (5, 3),
        cnn_strides: Tuple[int, ...] = (1, 1),
        cnn_norm: Optional[str] = "batch",
        lstm_hidden: int = 128,
        lstm_layers: int = 2,
        lstm_dropout: float = 0.2,
        attn_dim: int = 128,
        head_hidden: int = 128,
        output_dim: int = 1,
        task: str = "regression",  # or "classification"
        dropout: float = 0.2,
    ):
        super().__init__()
        self.task = task

        self.cnn = ConvFeatureExtractor(
            in_ch=n_features,
            channels=cnn_channels,
            kernels=cnn_kernels,
            strides=cnn_strides,
            dropout=dropout,
            norm=cnn_norm,
        )

        # nn.LSTM only applies dropout between stacked layers, so pass 0 for a
        # single-layer network.
        if lstm_layers > 1:
            inter_layer_dropout = lstm_dropout
        else:
            inter_layer_dropout = 0.0
        self.lstm = nn.LSTM(
            input_size=cnn_channels[-1],
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            dropout=inter_layer_dropout,
            bidirectional=True,
            batch_first=True,
        )

        # Forward and backward states are concatenated by the BiLSTM.
        bi_width = 2 * lstm_hidden
        self.attn = AdditiveAttention(hidden_dim=bi_width, attn_dim=attn_dim, dropout=dropout)

        # The head emits `output_dim` values for both tasks (num_classes when
        # classifying).
        self.head = nn.Sequential(
            nn.Linear(bi_width, head_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden, output_dim),
        )

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the full pipeline on ``x`` of shape ``(B, T, F)``.

        Returns:
            ``(out, alpha)`` where ``alpha`` holds the attention weights
            ``(B, T)``. ``out`` is the raw head output ``(B, output_dim)`` for
            classification, otherwise the head output with its last dimension
            squeezed.
        """
        encoded = self.cnn(x)                              # (B, T, C)
        states, _ = self.lstm(encoded)                     # (B, T, 2H)
        pooled, weights = self.attn(states, mask=mask)     # (B, 2H), (B, T)
        result = self.head(pooled)                         # (B, out)
        if self.task != "classification":
            # regression scalar + weights
            result = result.squeeze(-1)
        return result, weights

In [8]:
# -----------------------------
# Training utilities (regression example)
# -----------------------------
@torch.no_grad()
def evaluate(model, loader, criterion, device="cpu"):
    """Return the sample-weighted mean of ``criterion`` over ``loader``.

    Args:
        model: Module whose forward returns ``(prediction, extra)``; only the
            prediction is used. It is switched to eval mode and left there.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss callable taking ``(prediction, target)`` and returning a
            batch-mean scalar tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Mean loss per sample as a float; ``0.0`` when the loader is empty.
    """
    model.eval()
    total_loss, n_seen = 0.0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        pred, _ = model(xb)
        # A (B, 1) target against a (B,) prediction would be broadcast to (B, B)
        # by elementwise losses and silently report a wrong value. Align the
        # target when it holds the same number of elements.
        if yb.shape != pred.shape and yb.numel() == pred.numel():
            yb = yb.reshape(pred.shape)
        loss = criterion(pred, yb)
        bs = xb.size(0)
        # criterion is a batch mean, so weight by batch size for a true average
        # even when the last batch is smaller.
        total_loss += loss.item() * bs
        n_seen += bs
    return total_loss / max(n_seen, 1)

In [9]:
def fit(
    model,
    train_loader,
    val_loader=None,
    epochs: int = 20,
    lr: float = 1e-3,
    weight_decay: float = 1e-4,   # L2 regularization
    decay_type: str = "exponential",  # or "step" or "none"
    decay_rate: float = 0.8,      # gamma for ExponentialLR
    step_size: int = 10,          # for StepLR
    device: str = "cpu",
):
    """Train ``model`` with Adam + MSE and optionally restore the best weights.

    Args:
        model: Module whose forward returns ``(prediction, extra)``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Optional validation batches. When given, the weights with
            the lowest validation loss are loaded back before returning.
        epochs: Number of passes over ``train_loader``.
        lr: Initial Adam learning rate.
        weight_decay: Adam ``weight_decay`` (L2 penalty).
        decay_type: ``"exponential"`` (ExponentialLR), ``"step"`` (StepLR); any
            other value disables the scheduler.
        decay_rate: ``gamma`` of the chosen scheduler.
        step_size: Epoch period for StepLR.
        device: Device used for the model and every batch.

    Returns:
        The same ``model`` instance, trained in place.
    """
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    if decay_type == "exponential":
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_rate)
    elif decay_type == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=decay_rate)
    else:
        scheduler = None

    best_val, best_state = math.inf, None

    for epoch in range(1, epochs + 1):
        model.train()
        running, n_seen = 0.0, 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad(set_to_none=True)
            pred, _ = model(xb)
            # Guard against silent (B,) vs (B, 1) broadcasting inside MSELoss,
            # which would optimise a (B, B) pairwise error instead.
            if yb.shape != pred.shape and yb.numel() == pred.numel():
                yb = yb.reshape(pred.shape)
            loss = criterion(pred, yb)
            loss.backward()
            # Clip the global gradient norm to keep LSTM updates bounded.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running += loss.item() * xb.size(0)
            n_seen += xb.size(0)
        train_loss = running / max(n_seen, 1)

        # Schedulers here are stepped once per epoch.
        if scheduler is not None:
            scheduler.step()

        if val_loader is not None:
            val_loss = evaluate(model, val_loader, criterion, device)
            if val_loss < best_val:
                best_val = val_loss
                # state_dict() hands out references to the live tensors and
                # .cpu() is a no-op for CPU tensors, so clone to take a real
                # snapshot that later optimizer steps cannot overwrite.
                best_state = {
                    k: v.detach().cpu().clone() for k, v in model.state_dict().items()
                }
            print(f"Epoch {epoch:03d} | train: {train_loss:.4f} | val: {val_loss:.4f}")
        else:
            print(f"Epoch {epoch:03d} | train: {train_loss:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

In [11]:
if __name__ == "__main__":
    # Fix the torch RNG so weight initialisation is repeatable.
    torch.manual_seed(0)

    # ==== Hyperparameters from table ====
    num_conv_layers = 2
    num_filters = 96
    lstm_hidden_dim = 6  # per direction
    lstm_num_layers = 2
    attention_dim = 64
    num_attention_heads = 3  # Not used: AdditiveAttention is single-head; kept for reference only
    activation_fn = "relu"   # Not used: the model classes hard-code GELU; kept for reference only
    filter_sizes = (3, 3)    # one kernel size per conv layer (length must equal num_conv_layers)
    initial_lr = 0.01
    l2_reg = 0.001
    lr_drop_factor = 0.1
    lr_drop_period = 400     # NOTE: larger than max_epochs, so the StepLR drop never fires in this run
    max_epochs = 100

    # ==== Define model ====
    model = CNNBiLSTMAttention(
        n_features=len(features),
        cnn_channels=(num_filters,) * num_conv_layers,
        cnn_kernels=filter_sizes,
        cnn_strides=(1,) * num_conv_layers,
        cnn_norm="batch",
        lstm_hidden=lstm_hidden_dim,
        lstm_layers=lstm_num_layers,
        lstm_dropout=0.0,
        attn_dim=attention_dim,
        head_hidden=64,   # arbitrary choice, can be tuned
        output_dim=1,
        task="regression",
        dropout=0.0
    )

    # ==== Device ====
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # ==== Train ====
    model = fit(
        model,
        train_loader,
        val_loader,
        epochs=max_epochs,
        lr=initial_lr,
        weight_decay=l2_reg,
        decay_type="step",
        decay_rate=lr_drop_factor,
        step_size=lr_drop_period,
        device=device
    )

    # ==== Test forward pass ====
    # Put the model in inference mode explicitly so BatchNorm uses its running
    # statistics (and does not update them) regardless of the mode fit() left it in.
    model.eval()
    with torch.no_grad():
        xb, yb = next(iter(val_loader))
        pred, alpha = model(xb.to(device))
        print("Pred shape:", pred.shape)
        print("Alpha shape:", alpha.shape)


Epoch 001 | train: 1.5618 | val: 23.2569
Epoch 002 | train: 1.2529 | val: 21.2183
Epoch 003 | train: 1.1405 | val: 18.2412
Epoch 004 | train: 1.0610 | val: 16.4690
Epoch 005 | train: 0.9875 | val: 16.4294
Epoch 006 | train: 0.9628 | val: 18.8288
Epoch 007 | train: 0.8566 | val: 15.2853
Epoch 008 | train: 0.9015 | val: 16.4883
Epoch 009 | train: 0.8599 | val: 15.5488
Epoch 010 | train: 0.8158 | val: 15.0208
Epoch 011 | train: 0.8714 | val: 15.0795
Epoch 012 | train: 0.8390 | val: 15.5838
Epoch 013 | train: 0.9402 | val: 18.3223
Epoch 014 | train: 0.8890 | val: 17.8106
Epoch 015 | train: 0.8101 | val: 18.0734
Epoch 016 | train: 0.8060 | val: 14.8987
Epoch 017 | train: 0.7649 | val: 23.5465
Epoch 018 | train: 0.7357 | val: 13.3169
Epoch 019 | train: 0.7054 | val: 12.5696
Epoch 020 | train: 0.6885 | val: 12.6744
Epoch 021 | train: 0.6703 | val: 14.1541
Epoch 022 | train: 0.7001 | val: 14.4655
Epoch 023 | train: 0.7286 | val: 12.8685
Epoch 024 | train: 0.7483 | val: 13.6465
Epoch 025 | trai