In [45]:
# Import libraries and set up config
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import os
import random
# Data pipeline: Load, join, align, feature engineering, targets, normalization, and sequence prep
import pandas as pd
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append('../src')
from model_def import DeltaSenseTransformer

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Torch version:', torch.__version__)
print('Cuda version:', torch.version.cuda)
print('Using device:', DEVICE)


Torch version: 2.5.1
Cuda version: 11.8
Using device: cuda


In [33]:
TARGET_COL = '1h_ETH-USD_Close'

crypto = pd.read_csv('../data/crypto/1h.csv', index_col=0, parse_dates=True)
context = pd.read_csv('../data/market_context/1h.csv', index_col=0, parse_dates=True)
crypto_daily = pd.read_csv('../data/crypto/1d.csv', index_col=0, parse_dates=True)
context_daily = pd.read_csv('../data/market_context/1d.csv', index_col=0, parse_dates=True)

# --- Align context to crypto (shift context back 30min to match crypto timestamps) ---
context.index = context.index - pd.Timedelta(minutes=30)

# --- Find common date range ---
start_date = max(crypto.index.min(), context.index.min())
end_date = min(crypto.index.max(), context.index.max())
crypto = crypto[(crypto.index >= start_date) & (crypto.index <= end_date)]
context = context[(context.index >= start_date) & (context.index <= end_date)]

# --- Join and sort ---
features = crypto.join(context, how='outer')
features = features.sort_index().ffill()

# drop cols that are all nan
print("Before dropping all-nan columns:", features.shape)
features = features.dropna(axis=1, how='all')
print("after dropping all-nan columns:", features.shape)
features = features.ffill().dropna()
print("after dropping all-nan rows:", features.shape)
print('start date:', features.index.min(), 'end date:', features.index.max())
display(features.tail(2))



Before dropping all-nan columns: (16493, 1742)
after dropping all-nan columns: (16493, 1680)
after dropping all-nan rows: (12509, 1680)
start date: 2024-01-27 13:00:00+00:00 end date: 2025-07-01 17:00:00+00:00
after dropping all-nan rows: (12509, 1680)
start date: 2024-01-27 13:00:00+00:00 end date: 2025-07-01 17:00:00+00:00


Unnamed: 0,1h_BTC-USD_Close,1h_BTC-USD_High,1h_BTC-USD_Low,1h_BTC-USD_Open,1h_BTC-USD_Volume,1h_BTC-USD_Pct_Change_1,1h_BTC-USD_Pct_Change_5,1h_BTC-USD_Pct_Change_10,1h_BTC-USD_SMA_10,1h_BTC-USD_SMA_20,...,1h_VXZ_BB_upper,1h_VXZ_BB_lower,1h_VXZ_RSI,1h_VXZ_OBV,1h_VXZ_ATR,1h_VXZ_MFI,1h_VXZ_Hist_Volatility,1h_VXZ_Donchian_Upper,1h_VXZ_Donchian_Lower,1h_VXZ_Z_Score
2025-07-01 16:00:00+00:00,106075.53125,106282.726562,105926.5625,106027.320312,11422590000.0,0.02994,-0.502636,-0.787414,106495.921875,106812.092969,...,58.537098,57.645352,67.354319,-1413108.0,0.152536,71.730843,0.007874,58.43,57.75,0.824345
2025-07-01 17:00:00+00:00,106219.570312,106315.867188,105958.820312,106114.359375,6050824000.0,0.135789,-0.228747,-0.895195,106399.975781,106764.11875,...,58.537098,57.645352,67.354319,-1413108.0,0.152536,71.730843,0.007874,58.43,57.75,0.824345


In [34]:
# --- Multi-horizon, multi-class targets (6 horizons, 7 classes) ---
def make_targets(prices, horizons=[24, 48, 72, 96, 144, 240], thresholds=[-0.07, -0.03, 0, 0.03, 0.07]):
    # thresholds: [-7%, -3%, -.1%, +.1%, +3%, +7%]
    # classes: 0: < -7%, 1: -7% to -3%, 2: -3% to -1%, 3: -1% to +1%, 4: +1% to +3%, 5: +3% to +7%, 6: > +7%
    targets = np.zeros((len(prices), len(horizons)), dtype=np.int64)
    for h_idx, h in enumerate(horizons):
        future = prices.shift(-h)
        pct = (future - prices) / prices
        targets[:, h_idx] = np.select([
            pct <= thresholds[0],
            (pct > thresholds[0]) & (pct <= thresholds[1]),
            (pct > thresholds[1]) & (pct <= thresholds[2]),
            (pct > thresholds[2]) & (pct <= thresholds[3]),
            (pct > thresholds[3]) & (pct <= thresholds[4]),
            (pct > thresholds[4])
        ], [0, 1, 2, 3, 4, 5], default=3)
    return targets

horizons = [24, 48, 72, 96, 144, 240] # Num hours ahead (1 day, 2 days, 3 days, 4 days, 6 days, 10 days)
targets = make_targets(features[TARGET_COL], horizons=horizons)

# --- Drop rows with NaN targets (due to shifting) ---
valid_idx = ~np.isnan(targets).any(axis=1)
features = features.iloc[valid_idx]
targets = targets[valid_idx]

# --- Normalize features ---
scaler = StandardScaler()
features = scaler.fit_transform(features)
print("Features normalized. Shape:", features.shape)
print("Targets shape:", targets.shape)


Features normalized. Shape: (12509, 1680)
Targets shape: (12509, 6)


In [38]:
# --- Prepare sequences for transformer ---
SEQ_LEN = 240  # 10 days of hourly data (adjust as needed)
class CryptoDataset(Dataset):
    def __init__(self, features, targets, seq_len):
        self.features = features
        self.targets = targets
        self.seq_len = seq_len
    def __len__(self):
        return len(self.features) - self.seq_len
    def __getitem__(self, idx):
        x = self.features[idx:idx+self.seq_len]
        y = self.targets[idx+self.seq_len-1]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.long)

dataset = CryptoDataset(features, targets, SEQ_LEN)
total = len(dataset)
train_end = int(0.7 * total)
val_end = int(0.85 * total)

train_dataset = torch.utils.data.Subset(dataset, range(0, train_end))
val_dataset = torch.utils.data.Subset(dataset, range(train_end, val_end))
test_dataset = torch.utils.data.Subset(dataset, range(val_end, total))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}, Test samples: {len(test_dataset)}")
print(f"Feature shape: {features.shape}, Targets shape: {targets.shape}")

Train samples: 8588, Val samples: 1840, Test samples: 1841
Feature shape: (12509, 1680), Targets shape: (12509, 6)


In [43]:
input_dim = features.shape[1]
seq_len = SEQ_LEN
n_horizons = 6
n_classes = 6

model = DeltaSenseTransformer(
    input_dim=input_dim,
    seq_len=seq_len,
    n_horizons=n_horizons,
    n_classes=n_classes,
    d_model=64,
    nhead=4,
    num_layers=3,
    dim_feedforward=128,
    dropout=0.1
).to(DEVICE)

print(model)


DeltaSenseTransformer(
  (input_proj): Linear(in_features=1680, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (head): Linear(in_features=64, out_features=36, bias=True)
)


In [47]:
import torch.nn.functional as F

def hyperbolic_discount_loss(outputs, targets, K=0.05, horizons=None):
    """
    outputs: (batch, n_horizons, n_classes) - logits
    targets: (batch, n_horizons) - integer class labels
    K: hyperbolic discount factor (Glimcher's K)
    horizons: list/array of horizon steps (e.g. [24, 48, ...])
    Penalizes by class distance and discounts by horizon.
    """
    if horizons is None:
        horizons = torch.arange(outputs.shape[1], device=outputs.device) + 1
    batch, n_horizons, n_classes = outputs.shape
    loss = 0.0
    for h in range(n_horizons):
        # Hyperbolic discount: 1/(1+K*t)
        t = horizons[h] if hasattr(horizons, '__getitem__') else h+1
        discount = 1.0 / (1.0 + K * t)
        logits = outputs[:, h, :]
        target = targets[:, h]
        probs = F.log_softmax(logits, dim=-1)
        # Penalize by class distance
        penalty_matrix = torch.abs(torch.arange(n_classes, device=outputs.device).unsqueeze(0) - target.unsqueeze(1)).float()
        # Cross-entropy for each class, weighted by distance
        ce = -probs.gather(1, target.unsqueeze(1)).squeeze(1)
        weighted_ce = (ce * 1.0 + (probs.exp() * penalty_matrix).sum(dim=1))
        loss += discount * weighted_ce.mean()
    return loss / n_horizons

# Example usage in training loop:
# loss = hyperbolic_discount_loss(out, yb, K=0.05, horizons=[24,48,72,96,144,240])


In [None]:
# --- Training loop with early stopping ---
EPOCHS = 300
PATIENCE = 5
best_val_loss = float('inf')
early_stop_counter = 0
train_losses, val_losses = [], []
best_model_path = f'../notebooks/best_{TARGET_COL}_transformer.pth'

# Use the custom hyperbolic discounting loss
def get_horizons():
    return [24, 48, 72, 96, 144, 240]

optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        out = model(xb)
        loss = hyperbolic_discount_loss(out, yb, K=0.05, horizons=get_horizons())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)
    train_loss = running_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Validation
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            out = model(xb)
            loss = hyperbolic_discount_loss(out, yb, K=0.05, horizons=get_horizons())
            val_running_loss += loss.item() * xb.size(0)
    val_loss = val_running_loss / len(val_loader.dataset)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        early_stop_counter += 1
        if early_stop_counter >= PATIENCE and epoch >= 150:  # Start checking after 10 epochs
            print(f"Early stopping at epoch {epoch+1}")
            break

# --- Plot train/val losses ---
# plt.figure(figsize=(8,5))
# plt.plot(train_losses, label='Train Loss')
# plt.plot(val_losses, label='Val Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.title('Training and Validation Loss')
# plt.show()

# --- Confusion matrix for each horizon ---
model.load_state_dict(torch.load(best_model_path))
model.eval()
all_preds = []
all_targets = []
with torch.no_grad():
    for xb, yb in val_loader:
        xb = xb.to(DEVICE)
        out = model(xb)  # (batch, n_horizons, n_classes)
        preds = out.argmax(-1).cpu().numpy()
        all_preds.append(preds)
        all_targets.append(yb.numpy())
all_preds = np.concatenate(all_preds, axis=0)
all_targets = np.concatenate(all_targets, axis=0)

# class_names = ['<-7%', '-7%~-3%', '-3%~-1%', '-1%~+1%', '+1%~+3%', '+3%~+7%', '>+7%']
# for h in range(n_horizons):
#     cm = confusion_matrix(all_targets[:,h], all_preds[:,h], labels=list(range(7)))
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
#     disp.plot(cmap='Blues')
#     plt.title(f'Confusion Matrix - Horizon {h+1}')
#     plt.show()


Epoch 1: train_loss=0.6153, val_loss=0.8514
Epoch 2: train_loss=0.6318, val_loss=0.8362
Epoch 3: train_loss=0.6270, val_loss=0.8002
Epoch 4: train_loss=0.6169, val_loss=0.7754
Epoch 5: train_loss=0.6104, val_loss=0.7600
Epoch 6: train_loss=0.6087, val_loss=0.8348
Epoch 7: train_loss=0.6072, val_loss=0.7429
Epoch 8: train_loss=0.5980, val_loss=0.7534
Epoch 9: train_loss=0.5934, val_loss=0.7950
Epoch 10: train_loss=0.5955, val_loss=0.7625
