# TimesNet Medium Configuration - Financial Data Training

This notebook contains a **medium TimesNet configuration** optimized for:
- Balanced performance and training time
- Production-ready experimentation
- Good accuracy with reasonable compute requirements
- Standard research benchmarking

**Dataset**: Financial time series with 4 targets + 114 covariates (118 total features)

**Training Time**: ~15-25 minutes per epoch

In [None]:
# Import required libraries
import os
import sys
import time
import torch
import numpy as np
import pandas as pd
from datetime import datetime

# Add project root to path
sys.path.append(os.path.dirname(os.path.abspath('.')))

from models.TimesNet import Model as TimesNet
from utils.tools import EarlyStopping, adjust_learning_rate
from utils.metrics import metric
from utils.logger import logger
from data_provider.data_loader import Dataset_Custom
from torch.utils.data import DataLoader

# Environment report: imports resolved, installed torch build, accelerator availability.
print(
    "✅ All imports successful",
    f"🔥 PyTorch version: {torch.__version__}",
    f"💻 Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}",
    sep="\n",
)

## ⚖️ Medium Configuration Parameters

**Purpose**: Balanced performance and efficiency for production use

In [None]:
# ================================
# MEDIUM CONFIGURATION - TIMESNET
# ================================

class MediumConfig:
    """Medium-sized TimesNet experiment settings, exposed as plain class attributes.

    The notebook creates one instance of this class and hands it to the
    dataset, the model constructor and the learning-rate helper, so every
    attribute name below is part of the contract with those components.
    """

    # ----- Data source -----
    # Dataset type label ('custom' = the prepared financial data).
    data = 'custom'
    # Directory that holds the data files, and the main file inside it.
    root_path = './data/'
    data_path = 'prepared_financial_data.csv'
    # Forecasting mode: 'M' multivariate, 'S' univariate,
    # 'MS' multivariate-to-univariate.
    features = 'M'
    # Primary target column (used by the 'S' mode).
    target = 'log_Close'
    # Sampling frequency code: 'b' business day, 'h' hourly, 'd' daily.
    freq = 'b'

    # ----- Window lengths (in time steps) -----
    # Lookback window fed to the model.
    seq_len = 100
    # Number of known steps placed at the start of the decoder input.
    label_len = 20
    # Forecast horizon.
    pred_len = 10

    # ----- Split sizes (in time steps) -----
    val_len = 20
    test_len = 20
    # Production forecast length (future predictions beyond the data).
    prod_len = 10

    # ----- Model dimensions -----
    # 118 features in total: 4 targets + 114 covariates. Input and output
    # widths are kept equal so the model output lines up with the data.
    enc_in = 118
    dec_in = 118
    c_out = 118
    # Embedding width and feed-forward width (2x d_model).
    d_model = 64
    d_ff = 128

    # ----- Depth / attention-style settings -----
    # NOTE(review): n_heads, d_layers, factor and distil are carried in the
    # config; confirm which of them models/TimesNet actually reads.
    n_heads = 8
    e_layers = 3
    d_layers = 1

    # ----- TimesNet-specific settings -----
    # Number of dominant frequencies considered per block.
    top_k = 5
    # Number of convolution kernels in the Inception blocks.
    num_kernels = 6

    # ----- Regularization -----
    dropout = 0.1

    # ----- Miscellaneous model settings -----
    # Time-feature embedding type; alternatives are 'fixed' and 'learned'.
    # The data-loader cell maps 'timeF' to timeenc=1 and anything else to 0.
    embed = 'timeF'
    # Activation name: 'gelu', 'relu' or 'swish'.
    activation = 'gelu'
    # Attention factor (usually 1).
    factor = 1
    # Distilling toggle (NOTE(review): confirm whether TimesNet uses it).
    distil = True
    # Moving-average window for trend decomposition.
    moving_avg = 25
    # Set True to request attention weights from the model.
    output_attention = False

    # ----- Training schedule -----
    train_epochs = 20
    batch_size = 32
    learning_rate = 0.0001
    # Early-stopping patience, in epochs.
    patience = 7
    # Learning-rate adjustment strategy name passed to adjust_learning_rate.
    lradj = 'type1'

    # ----- Loss / precision -----
    # Loss name: 'MSE', 'MAE' or 'Huber'.
    loss = 'MSE'
    # Automatic mixed precision switch.
    use_amp = False

    # ----- System -----
    # DataLoader worker processes.
    num_workers = 6
    # Random seed for reproducibility.
    seed = 2024

    # ----- Task -----
    task_name = 'short_term_forecast'

    # ----- Experiment tracking -----
    des = 'medium_config'
    # Evaluated once, when the class body runs, so every instance shares the
    # same minute-resolution timestamped directory.
    checkpoints = './checkpoints/TimesNet_medium_' + datetime.now().strftime('%Y%m%d_%H%M')
    
# Instantiate the configuration; later cells read and mutate this object.
args = MediumConfig()

# One-shot summary of the headline settings.
print(
    "⚖️ Medium Configuration Loaded:",
    f"   📏 Sequence Length: {args.seq_len}",
    f"   🎯 Prediction Length: {args.pred_len}",
    f"   🧠 Model Dimension: {args.d_model}",
    f"   ⚡ Epochs: {args.train_epochs}",
    f"   📊 Batch Size: {args.batch_size}",
    sep="\n",
)

## 🎛️ Tweakable Parameters

Modify these parameters to experiment with different configurations:

In [None]:
# ================================
# TWEAKABLE PARAMETERS - EXPERIMENT
# ================================

# Experiment overrides: edit a value below and re-run this cell. Each entry is
# written onto the existing `args` instance (the class defaults stay untouched).
vars(args).update(
    # --- Sequence parameters (affect model complexity and data usage) ---
    seq_len=100,            # Try: 50, 100, 200 (longer = more context, slower training)
    pred_len=10,            # Try: 5, 10, 20 (longer = harder prediction task)
    label_len=20,           # Try: seq_len//5 to seq_len//2 (decoder start overlap)

    # --- Model size (affect memory usage and training time) ---
    d_model=64,             # Try: 32, 64, 128 (larger = more capacity, slower)
    d_ff=128,               # Try: 64, 128, 256 (usually 2x d_model)
    n_heads=8,              # Try: 4, 8, 16 (keep d_model divisible by n_heads)
    e_layers=3,             # Try: 2, 3, 4 (more layers = deeper model)

    # --- TimesNet specific (affect pattern recognition capability) ---
    top_k=5,                # Try: 3, 5, 8 (more frequencies = more complex patterns)
    num_kernels=6,          # Try: 4, 6, 8 (more kernels = more feature extraction)
    moving_avg=25,          # Try: 15, 25, 50 (window for trend decomposition)

    # --- Training parameters ---
    train_epochs=20,        # Try: 15, 20, 30
    batch_size=32,          # Try: 16, 32, 64 (larger = faster but more memory)
    learning_rate=0.0001,   # Try: 0.00005, 0.0001, 0.0005
    patience=7,             # Try: 5, 7, 10 (early stopping patience)

    # --- Advanced tweaks ---
    dropout=0.1,            # Try: 0.0, 0.1, 0.2 (higher = more regularization)
    factor=1,               # Try: 1, 3, 5 (attention sparsity factor)
    distil=True,            # Try: True, False (distilling toggle)
    activation='gelu',      # Try: 'gelu', 'relu', 'swish'

    # --- Loss function experiments ---
    loss='MSE',             # Try: 'MSE', 'MAE' (different loss characteristics)
    use_amp=False,          # Try: True for faster training (if supported)
)

# Echo the effective settings so the cell output documents the experiment.
print(
    "✏️ Updated Medium Configuration:",
    f"   Model Size: d_model={args.d_model}, d_ff={args.d_ff}, heads={args.n_heads}, layers={args.e_layers}",
    f"   TimesNet: top_k={args.top_k}, kernels={args.num_kernels}, moving_avg={args.moving_avg}",
    f"   Training: epochs={args.train_epochs}, batch={args.batch_size}, lr={args.learning_rate}",
    f"   Advanced: dropout={args.dropout}, factor={args.factor}, loss={args.loss}",
    sep="\n",
)

## 🚀 Training Setup and Execution

In [None]:
# Setup device, seed the RNGs and create the checkpoint directory
import random

# args.seed is documented as the reproducibility seed, so apply it to every
# generator the pipeline can draw from (Python, NumPy, and torch — the torch
# call also covers CUDA generators) before any model or loader is created.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(args.checkpoints, exist_ok=True)

print(f"🔥 Using device: {device}")
print(f"📁 Checkpoints: {args.checkpoints}")

# Data loader setup
def create_data_loader(flag):
    """Build the DataLoader for one split of the prepared dataset.

    Args:
        flag: Split selector forwarded to ``Dataset_Custom``; this notebook
            uses 'train', 'val' and 'test'.

    Returns:
        A ``DataLoader`` over the requested split. Only the training split is
        shuffled and only the training split drops its incomplete last batch.
    """
    # Publish the split sizes on args under these names before building the
    # dataset (presumably the names Dataset_Custom reads — verify against
    # data_provider/data_loader).
    args.validation_length = args.val_len
    args.test_length = args.test_len

    dataset = Dataset_Custom(
        args=args,
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        features=args.features,
        target=args.target,
        scale=True,
        timeenc=1 if args.embed == 'timeF' else 0,
        freq=args.freq
    )

    is_train = (flag == 'train')
    # Evaluation splits must keep their final partial batch: with short
    # val/test windows and batch_size=32 a split can be smaller than one
    # batch, and dropping it would leave the loader empty (validation loss
    # would then be reported as inf and no test metrics would be produced).
    return DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=is_train,
        num_workers=args.num_workers,
        drop_last=is_train
    )

# Build one loader per split, in train -> val -> test order.
train_loader, val_loader, test_loader = (
    create_data_loader('train'),
    create_data_loader('val'),
    create_data_loader('test'),
)

# Report how many batches each split yields.
print(
    "📊 Data loaders created:",
    f"   Train: {len(train_loader)} batches",
    f"   Val: {len(val_loader)} batches",
    f"   Test: {len(test_loader)} batches",
    sep="\n",
)

In [None]:
# Initialize TimesNet model
model = TimesNet(args).to(device)

# Setup training components.
# Map every loss name the configuration advertises to its criterion class.
# 'Huber' is listed in the config comments but previously fell through to MSE
# without any notice.
loss_factories = {
    'MSE': torch.nn.MSELoss,
    'MAE': torch.nn.L1Loss,
    'Huber': torch.nn.HuberLoss,
}
if args.loss not in loss_factories:
    # Keep the original MSE fallback, but make it visible so a typo in
    # args.loss does not silently change the experiment.
    print(f"⚠️ Unknown loss '{args.loss}' - falling back to MSE")
criterion = loss_factories.get(args.loss, torch.nn.MSELoss)()

optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
early_stopping = EarlyStopping(patience=args.patience, verbose=True)

# Model info
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"🧠 TimesNet Medium Model Initialized:")
print(f"   📊 Total Parameters: {total_params:,}")
print(f"   🎯 Trainable Parameters: {trainable_params:,}")
# Size estimate assumes 4 bytes per parameter (float32 weights).
print(f"   💾 Model Size: ~{total_params * 4 / 1024 / 1024:.1f} MB")
print(f"   📈 Loss Function: {args.loss}")

In [None]:
# Training function with detailed progress tracking
def train_epoch():
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Uses the notebook globals ``model``, ``train_loader``, ``optimizer``,
    ``criterion``, ``device`` and ``args``. The loss is computed on the last
    ``args.pred_len`` steps of the first four feature columns only.

    Returns:
        float: Average of the per-batch losses over the epoch.

    Raises:
        RuntimeError: If ``train_loader`` yields no batches, since an epoch
            without batches would train nothing.
    """
    model.train()
    num_batches = len(train_loader)
    if num_batches == 0:
        raise RuntimeError(
            "train_loader has no batches; reduce args.batch_size or check the training split"
        )

    # The notebook header describes 4 targets + 114 covariates; the slice below
    # assumes the targets are the first columns — verify against the CSV layout.
    n_targets = 4

    # Mixed precision is only meaningful on CUDA. When it is active the loss
    # must go through a GradScaler, otherwise small fp16 gradients can
    # underflow to zero. The scaler is cached on the function so its scale
    # factor carries over from one epoch to the next.
    amp_enabled = bool(args.use_amp) and device.type == 'cuda'
    scaler = None
    if amp_enabled:
        scaler = getattr(train_epoch, '_grad_scaler', None)
        if scaler is None:
            scaler = torch.cuda.amp.GradScaler()
            train_epoch._grad_scaler = scaler

    def forward_loss(batch_x, batch_x_mark, dec_inp, batch_y_mark, batch_y):
        # Forward pass plus loss restricted to the forecast horizon and target columns.
        outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
        target_outputs = outputs[:, -args.pred_len:, :n_targets]
        target_y = batch_y[:, -args.pred_len:, :n_targets]
        return criterion(target_outputs, target_y)

    total_loss = 0.0
    epoch_start_time = time.time()
    print(f"🏃 Training on {num_batches} batches...")

    for i, batch in enumerate(train_loader):
        # Move to device
        batch_x, batch_y, batch_x_mark, batch_y_mark = (
            tensor.float().to(device) for tensor in batch
        )

        # Decoder input: the first label_len steps of batch_y followed by
        # zeros standing in for the pred_len steps to be forecast.
        dec_inp = torch.cat(
            [batch_y[:, :args.label_len, :],
             torch.zeros_like(batch_y[:, -args.pred_len:, :])],
            dim=1,
        )

        optimizer.zero_grad()

        if amp_enabled:
            with torch.cuda.amp.autocast():
                loss = forward_loss(batch_x, batch_x_mark, dec_inp, batch_y_mark, batch_y)
            # Scale before backward; step() unscales and skips the update if
            # the gradients contain inf/NaN; update() adapts the scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss = forward_loss(batch_x, batch_x_mark, dec_inp, batch_y_mark, batch_y)
            loss.backward()
            optimizer.step()

        total_loss += loss.item()

        # Progress reporting (every 15 batches for medium config)
        if i % 15 == 0 or i == num_batches - 1:
            progress_pct = (i + 1) / num_batches * 100
            avg_loss = total_loss / (i + 1)
            elapsed = time.time() - epoch_start_time
            # Linear extrapolation from the average time per finished batch.
            remaining = elapsed / (i + 1) * (num_batches - i - 1)
            print(f"   Batch {i+1:3d}/{num_batches} ({progress_pct:5.1f}%) - "
                  f"Loss: {loss.item():.6f} (Avg: {avg_loss:.6f}) - "
                  f"Remaining: {remaining:.1f}s")

    epoch_time = time.time() - epoch_start_time
    avg_loss = total_loss / num_batches
    print(f"✅ Epoch completed in {epoch_time:.1f}s. Average loss: {avg_loss:.6f}")
    return avg_loss

# Validation function
def validate_epoch():
    """Evaluate the model on ``val_loader`` and return the mean batch loss.

    Uses the notebook globals ``model``, ``val_loader``, ``criterion``,
    ``device`` and ``args``. Gradients are disabled for the whole pass.

    Returns:
        float: Average per-batch loss, or ``float('inf')`` when the loader
        yields no batches.
    """
    model.eval()
    loss_sum = 0.0
    batches_seen = 0

    with torch.no_grad():
        for batch in val_loader:
            batch_x, batch_y, batch_x_mark, batch_y_mark = (
                tensor.float().to(device) for tensor in batch
            )

            # Decoder input: known label_len prefix followed by zeros for the horizon.
            horizon_zeros = torch.zeros_like(batch_y[:, -args.pred_len:, :])
            dec_inp = torch.cat([batch_y[:, :args.label_len, :], horizon_zeros], dim=1)

            outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

            # Score only the forecast horizon of the first four feature columns.
            batch_loss = criterion(
                outputs[:, -args.pred_len:, :4],
                batch_y[:, -args.pred_len:, :4],
            )

            loss_sum += batch_loss.item()
            batches_seen += 1

    if batches_seen == 0:
        return float('inf')
    return loss_sum / batches_seen

print("🔧 Training functions defined")

In [None]:
# Main training loop
print(f"🚀 Starting TimesNet Medium Training ({args.train_epochs} epochs)")
# Rough estimate assuming about 20 minutes per epoch.
print(f"⏰ Estimated time: ~{args.train_epochs * 20} minutes")
print("="*60)

best_val_loss = float('inf')
train_losses = []
val_losses = []

training_start_time = time.time()

for epoch in range(args.train_epochs):
    print(f"\n🔄 Epoch {epoch+1}/{args.train_epochs}")

    # Train
    train_loss = train_epoch()
    train_losses.append(train_loss)

    # Validate
    print("🔍 Running validation...")
    val_loss = validate_epoch()
    val_losses.append(val_loss)

    # Log progress with trend analysis (up arrow only when the training loss
    # rose compared with the previous epoch).
    trend_emoji = "📈" if len(train_losses) > 1 and train_loss > train_losses[-2] else "📉"
    print(f"{trend_emoji} Epoch {epoch+1} Results: Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

    # Learning rate adjustment
    adjust_learning_rate(optimizer, epoch + 1, args)
    current_lr = optimizer.param_groups[0]['lr']
    print(f"🎛️ Learning rate: {current_lr:.6f}")

    # Early stopping
    early_stopping(val_loss, model, args.checkpoints)
    if early_stopping.early_stop:
        print("⏹️ Early stopping triggered")
        break

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f"{args.checkpoints}/best_model.pth")
        print(f"💾 New best model saved (Val Loss: {val_loss:.6f})")

    # Progress summary every 5 epochs
    if (epoch + 1) % 5 == 0:
        elapsed_time = time.time() - training_start_time
        remaining_epochs = args.train_epochs - (epoch + 1)
        estimated_remaining = elapsed_time / (epoch + 1) * remaining_epochs
        print(f"⏱️ Progress: {epoch+1}/{args.train_epochs} epochs ({elapsed_time/60:.1f}m elapsed, ~{estimated_remaining/60:.1f}m remaining)")

# If the validation loss never dropped below its initial inf (validate_epoch
# returns inf for an empty loader, and a NaN loss never compares as smaller),
# no best_model.pth was written during this run. The evaluation cell loads that
# file unconditionally, so store the final weights there instead of letting
# the load fail or pick up a stale file.
if best_val_loss == float('inf'):
    os.makedirs(args.checkpoints, exist_ok=True)
    torch.save(model.state_dict(), f"{args.checkpoints}/best_model.pth")
    print("⚠️ Validation loss never improved; saved final weights as best_model.pth")

total_training_time = time.time() - training_start_time
print(f"\n🎉 Training completed in {total_training_time/60:.1f} minutes!")
print(f"🏆 Best validation loss: {best_val_loss:.6f}")

## 📊 Results and Analysis

In [None]:
# Load best model and test.
# The checkpoint is a plain state_dict, so weights_only=True is sufficient and
# avoids unpickling arbitrary objects; map_location lets a checkpoint written
# on GPU be loaded in a CPU-only session.
best_model_path = f"{args.checkpoints}/best_model.pth"
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
model.eval()

# Test evaluation
preds = []
trues = []

print("🧪 Testing model...")
with torch.no_grad():
    for batch_x, batch_y, batch_x_mark, batch_y_mark in test_loader:
        batch_x = batch_x.float().to(device)
        batch_y = batch_y.float().to(device)
        batch_x_mark = batch_x_mark.float().to(device)
        batch_y_mark = batch_y_mark.float().to(device)

        # Decoder input: known label_len prefix followed by zeros for the horizon.
        dec_inp = torch.zeros_like(batch_y[:, -args.pred_len:, :])
        dec_inp = torch.cat([batch_y[:, :args.label_len, :], dec_inp], dim=1)

        outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

        # Keep only the forecast horizon of the first four feature columns,
        # matching what the training and validation losses are computed on.
        pred = outputs[:, -args.pred_len:, :4].cpu().numpy()
        true = batch_y[:, -args.pred_len:, :4].cpu().numpy()

        preds.append(pred)
        trues.append(true)

# Calculate metrics.
# NOTE(review): the dataset is built with scale=True and nothing here inverts
# that scaling, so the metrics are presumably in scaled units — confirm before
# comparing them with raw price errors.
if preds:
    preds = np.concatenate(preds, axis=0)
    trues = np.concatenate(trues, axis=0)

    mae, mse, rmse, mape, mspe = metric(preds, trues)

    print("\n📊 TimesNet Medium - Test Results:")
    print(f"   🎯 MSE:  {mse:.6f}")
    print(f"   📏 MAE:  {mae:.6f}")
    print(f"   📐 RMSE: {rmse:.6f}")
    # NOTE(review): verify whether utils.metrics returns MAPE/MSPE as
    # fractions or percentages; the '%' suffix assumes percentages.
    print(f"   📈 MAPE: {mape:.6f}%")
    print(f"   📉 MSPE: {mspe:.6f}%")

    # Training curve analysis: relative change between first and last epoch.
    if len(train_losses) > 1:
        final_improvement = (train_losses[0] - train_losses[-1]) / train_losses[0] * 100
        val_improvement = (val_losses[0] - val_losses[-1]) / val_losses[0] * 100
        print(f"\n📈 Training Analysis:")
        print(f"   📊 Training improvement: {final_improvement:.1f}%")
        print(f"   📊 Validation improvement: {val_improvement:.1f}%")
        print(f"   🎯 Final train/val ratio: {train_losses[-1]/val_losses[-1]:.3f}")

    # Summary
    print(f"\n📋 Configuration Summary:")
    print(f"   ⚖️ Model: Medium ({total_params:,} params)")
    print(f"   📏 Sequence: {args.seq_len} → {args.pred_len}")
    print(f"   🧠 Architecture: d_model={args.d_model}, layers={args.e_layers}, heads={args.n_heads}")
    print(f"   🔧 TimesNet: top_k={args.top_k}, kernels={args.num_kernels}")
    print(f"   ⏱️ Training time: {total_training_time/60:.1f} minutes")
    print(f"   🏆 Final performance: RMSE={rmse:.6f}")
else:
    print("⚠️ No test data available")

## 🔍 Model Insights (Optional Analysis)

In [None]:
# Optional: Analyze model attention (if output_attention=True)
if args.output_attention:
    print("🔍 Analyzing attention patterns...")

    # Get a sample batch for attention analysis; None when the loader is empty.
    sample_batch = next(iter(test_loader), None)

    if sample_batch is None:
        print("⚠️ No test batches available for attention analysis")
    else:
        batch_x, batch_y, batch_x_mark, batch_y_mark = (
            tensor.float().to(device) for tensor in sample_batch
        )

        # Decoder input: known label_len prefix followed by zeros for the horizon.
        dec_inp = torch.cat(
            [batch_y[:, :args.label_len, :],
             torch.zeros_like(batch_y[:, -args.pred_len:, :])],
            dim=1,
        )

        with torch.no_grad():
            result = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

        # The other cells use the model output as a single tensor. Only unpack
        # when the model really returned an (outputs, attentions) pair;
        # unpacking a bare tensor would split it along the batch axis and fail.
        if isinstance(result, (tuple, list)) and len(result) == 2:
            outputs, attentions = result
            has_attention = attentions is not None and len(attentions) > 0
            print(f"📊 Attention shape: {attentions[0].shape if has_attention else 'N/A'}")
        else:
            outputs = result
            print("ℹ️ Model returned predictions only; no attention weights to analyze")
else:
    print("💡 Tip: Set args.output_attention=True for attention analysis")

# Performance per target analysis.
# len() works both for the concatenated ndarray and for the empty list that
# `preds` remains when the test loader produced no batches (`.size` exists only
# on the ndarray and raised AttributeError in the empty case).
if len(preds) > 0:
    # Names for the first four feature columns, in the order they are assumed
    # to appear in the dataset — verify against the CSV column order.
    target_names = ['log_Open', 'log_High', 'log_Low', 'log_Close']
    print("\n🎯 Per-target Performance:")

    for i, target in enumerate(target_names):
        # Pool every sample and horizon step for this target column.
        target_preds = preds[:, :, i].flatten()
        target_trues = trues[:, :, i].flatten()

        target_mse = np.mean((target_preds - target_trues) ** 2)
        target_mae = np.mean(np.abs(target_preds - target_trues))

        print(f"   {target:10s}: MSE={target_mse:.6f}, MAE={target_mae:.6f}")
else:
    print("⚠️ No predictions available for per-target analysis")

print("\n✅ Medium configuration analysis complete!")