# PatchTST + HSQP Implementation for Time Series Forecasting

This notebook integrates the Hierarchical Symbolic-Quantized Patching (HSQP) method as a plugin into the original **PatchTST** framework. It is designed to run experiments on one dataset at a time to mitigate the 'session crashed after using all available RAM' issue.

**Datasets to be used:** `electricity`, `ETTh1`, `ETTh2`, `ETTm1`, `ETTm2`, `weather`, `traffic`, and `national_illness`.

## 1. Setup and Dependencies

In [None]:
# Install the libraries used by PatchTST and by the HSQP plugin defined below.
!pip install torch numpy pandas scikit-learn
# Upgrade scikit-learn explicitly: the plugin calls KMeans(n_init='auto'),
# which is only accepted by recent scikit-learn releases.
!pip install --upgrade scikit-learn

# Clone the PatchTST repository and switch into its supervised package.
# Every relative path used by the later cells is resolved from this directory.
!git clone https://github.com/yuqinie98/PatchTST.git
%cd PatchTST/PatchTST_supervised

# Make the package root and its sub-packages importable from the notebook.
import sys
sys.path.append('./')
sys.path.append('./models')
sys.path.append('./layers')
sys.path.append('./exp')

print("Setup complete. PatchTST repository cloned and paths configured.")

## 2. HSQP Plugin Implementation

The HSQP implementation is provided below. This code will be saved as `models/hsqp_plugin.py` within the cloned repository structure.

In [None]:
%%writefile ../models/hsqp_plugin.py
"""
Hierarchical Symbolic-Quantized Patching (HSQP) Implementation as a Plugin.

This module contains the necessary classes for the HSQP method, designed to be
integrated into existing time series models like PatchTST.
"""

import numpy as np
import torch
import torch.nn as nn
from sklearn.cluster import KMeans
from typing import List, Tuple, Dict, Union, Optional

# --- 1. TimeSeriesPatching ---
class TimeSeriesPatching:
    """
    Split time series into fixed-length patches and merge them back (Step 2 in HSQP).

    Patches start every ``stride`` samples. Trailing samples that do not fill a
    complete patch are dropped by ``create_patches``.
    """
    def __init__(self, patch_length: int = 24, stride: int = 12, overlap: bool = True):
        """
        Initialize the patching parameters.

        Args:
            patch_length: Length of each patch
            stride: Step size between patches (if overlap=True)
            overlap: Whether patches should overlap; when False the stride is
                     forced to ``patch_length``

        Raises:
            ValueError: If ``patch_length`` or the effective stride is < 1.
        """
        if patch_length < 1:
            raise ValueError(f"patch_length must be >= 1, got {patch_length}")
        effective_stride = stride if overlap else patch_length
        if effective_stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.patch_length = patch_length
        self.stride = effective_stride
        self.overlap = overlap

    def create_patches(self, time_series: np.ndarray) -> np.ndarray:
        """
        Create patches from a time series.

        Args:
            time_series: Time series data of shape [batch_size, seq_length, features]
                         or [seq_length, features] or [seq_length]

        Returns:
            float64 patches of shape [batch_size, num_patches, patch_length, features]
                         or [num_patches, patch_length, features]
                         or [num_patches, patch_length].
            ``num_patches`` is 0 when the sequence is shorter than one patch.

        Raises:
            ValueError: If the input is not 1-D, 2-D or 3-D.
        """
        time_series = np.asarray(time_series)
        was_1d = time_series.ndim == 1
        if was_1d:
            # Convert [seq_length] to [seq_length, 1]
            time_series = time_series.reshape(-1, 1)
        if time_series.ndim not in (2, 3):
            raise ValueError(
                f"time_series must be 1-D, 2-D or 3-D, got shape {time_series.shape}"
            )

        seq_length = time_series.shape[-2]
        # Clamp at zero so every too-short sequence yields an empty result
        # instead of a negative array dimension.
        num_patches = max((seq_length - self.patch_length) // self.stride + 1, 0)

        # window[i, j] is the sample index of element j of patch i.
        starts = np.arange(num_patches) * self.stride
        window = starts[:, None] + np.arange(self.patch_length)

        # One fancy-indexing step gathers every patch (and every batch item).
        patches = time_series[..., window, :].astype(np.float64)

        # Restore original dimensionality if input was 1D
        if was_1d:
            patches = patches.reshape(num_patches, self.patch_length)

        return patches

    def merge_patches(self, patches: np.ndarray, original_length: Optional[int] = None) -> np.ndarray:
        """
        Merge patches back into a time series.
        For overlapping regions, values are averaged.

        Args:
            patches: Patches of shape [batch_size, num_patches, patch_length, features]
                     or [num_patches, patch_length, features]
                     or [num_patches, patch_length]
            original_length: Original sequence length (optional). Positions not
                     covered by any patch are left at zero; patch samples that
                     fall beyond this length are discarded.

        Returns:
            Reconstructed time series with the patch axes replaced by one
            sequence axis.

        Raises:
            ValueError: If the input is not 2-D, 3-D or 4-D.
        """
        patches = np.asarray(patches)
        was_2d = patches.ndim == 2
        if was_2d:
            # [num_patches, patch_length] -> [num_patches, patch_length, 1]
            patches = patches[..., np.newaxis]
        if patches.ndim not in (3, 4):
            raise ValueError(
                f"patches must be 2-D, 3-D or 4-D, got shape {patches.shape}"
            )

        num_patches, patch_length, features = patches.shape[-3:]

        # Calculate reconstructed sequence length
        if original_length is not None:
            seq_length = original_length
        elif num_patches:
            seq_length = (num_patches - 1) * self.stride + patch_length
        else:
            seq_length = 0

        reconstructed = np.zeros(patches.shape[:-3] + (seq_length, features))
        # Coverage count per time step; broadcasts over batch and feature axes.
        counts = np.zeros((seq_length, 1))

        for i in range(num_patches):
            start_idx = i * self.stride
            if start_idx >= seq_length:
                break
            # Clip the last patch(es) when the caller asked for a shorter series.
            end_idx = min(start_idx + patch_length, seq_length)
            reconstructed[..., start_idx:end_idx, :] += patches[..., i, :end_idx - start_idx, :]
            counts[start_idx:end_idx] += 1

        # Average overlapping regions (uncovered positions keep their zero)
        reconstructed /= np.maximum(counts, 1)

        # Restore original dimensionality if input was 2D
        if was_2d:
            reconstructed = reconstructed.reshape(seq_length)

        return reconstructed


# --- 2. ABBASymbolicAggregation ---
class ABBASymbolicAggregation:
    """
    Implementation of ABBA (Aggregation-Based Amplitude Scaling) for symbolic pattern extraction (Step 3 in HSQP).
    This is a simplified implementation based on the fABBA library concepts.

    The instance is stateful: the first call to ``digitize`` that receives at
    least ``k`` segments fits a KMeans model, and every later call reuses it.
    """
    def __init__(self, tol: float = 0.1, alpha: float = 0.1, sorting: str = '2-norm', scl: float = 1, k: int = 10):
        """
        Initialize ABBA parameters.

        Args:
            tol: Tolerance for compression (maximum absolute deviation of a
                 segment from the data it replaces)
            alpha: Parameter for digitization (stored and reported in
                   ``parameters``; not used by the clustering itself)
            sorting: Method for sorting ('2-norm', 'area', etc.); stored only
            scl: Scaling factor applied to the (len, inc) features
            k: Number of symbols/clusters
        """
        self.tol = tol
        self.alpha = alpha
        self.sorting = sorting
        self.scl = scl
        self.k = k
        self.parameters = None
        self.kmeans = None

    def _within_tolerance(self, ts: np.ndarray, start_idx: int, end_idx: int) -> bool:
        """Return True if the chord start_idx -> end_idx stays within ``tol`` of the data."""
        t = np.linspace(0, 1, end_idx - start_idx + 1)
        line_segment = ts[start_idx] + (ts[end_idx] - ts[start_idx]) * t
        # A NaN error compares False, i.e. the extension is rejected.
        return bool(np.max(np.abs(line_segment - ts[start_idx:end_idx + 1])) <= self.tol)

    def compress(self, ts: np.ndarray) -> List[Tuple[float, float]]:
        """
        Compress time series into piecewise linear segments (polygonal chain).

        Each segment is grown greedily for as long as the straight line between
        its end points stays within ``tol`` of every sample it covers.

        Args:
            ts: Time series data (flattened to 1-D)

        Returns:
            List of (len, inc) tuples representing the polygonal segments;
            empty when ``ts`` has fewer than two samples
        """
        # Ensure ts is a 1D array
        ts = np.asarray(ts).flatten()
        n = len(ts)

        pieces = []
        start_idx = 0

        while start_idx < n - 1:
            # A two-point segment reproduces the data exactly, so it is always
            # accepted. Starting from it guarantees progress even when the
            # tolerance test can never succeed (NaN samples, negative tol),
            # which previously produced zero-length pieces forever.
            end_idx = start_idx + 1
            while end_idx + 1 < n and self._within_tolerance(ts, start_idx, end_idx + 1):
                end_idx += 1

            # Record length and increment of the segment
            pieces.append((end_idx - start_idx, ts[end_idx] - ts[start_idx]))

            # The next segment starts where this one ended
            start_idx = end_idx

        return pieces

    def digitize(self, pieces: List[Tuple[float, float]]) -> Tuple[List[str], Dict]:
        """
        Convert polygonal segments into symbolic representation.

        Args:
            pieces: List of (len, inc) tuples

        Returns:
            string: List of symbols (one character per piece, starting at 'a')
            parameters: Dictionary of parameters for inverse transformation
        """
        # Extract features from pieces; the reshape keeps an empty list 2-D.
        features = np.asarray(pieces, dtype=float).reshape(-1, 2)

        # Normalize features if needed
        if self.scl != 1:
            features = features / self.scl

        # Cluster the features
        if self.kmeans is None:
            # Ensure there are enough samples for clustering
            if len(features) < self.k:
                # Fallback: if not enough data, just use a single symbol 'a'.
                # NOTE: no model is fitted here, so as long as every call
                # supplies fewer than k pieces all symbols stay 'a'.
                symbols = ['a'] * len(features)
                self.parameters = {
                    'centers': np.array([[0.0, 0.0]]), # Placeholder
                    'scl': self.scl,
                    'alpha': self.alpha
                }
                return symbols, self.parameters

            # n_init='auto' requires a recent scikit-learn release.
            self.kmeans = KMeans(n_clusters=self.k, random_state=0, n_init='auto')
            self.kmeans.fit(features)

        # Get cluster assignments; predict() rejects empty input, so skip it.
        labels = self.kmeans.predict(features) if len(features) else []

        # Convert to string representation (a, b, c, ...)
        symbols = [chr(97 + int(label)) for label in labels]

        # Store parameters for inverse transformation
        self.parameters = {
            'centers': self.kmeans.cluster_centers_,
            'scl': self.scl,
            'alpha': self.alpha
        }

        return symbols, self.parameters

    def fit_transform(self, ts: np.ndarray) -> str:
        """
        Apply ABBA transformation to time series.

        Args:
            ts: Time series data

        Returns:
            Symbolic representation of the time series
        """
        pieces = self.compress(ts)
        symbols, _ = self.digitize(pieces)
        return ''.join(symbols)

    def inverse_transform(self, string: str, initial_value: float) -> np.ndarray:
        """
        Convert symbolic representation back to time series.

        Every symbol is replaced by its cluster centre (len, inc); the length
        is rounded to an integer of at least 1 and the increment is spread
        linearly over that many steps.

        Args:
            string: Symbolic representation
            initial_value: Initial value of the time series

        Returns:
            Reconstructed time series (starts with ``initial_value``)

        Raises:
            ValueError: If ``digitize`` has not been called yet.
        """
        if self.parameters is None:
            raise ValueError("ABBA model must be fitted before inverse transform")

        # Convert string to cluster indices
        indices = [ord(s) - 97 for s in string]

        # Get cluster centers
        centers = self.parameters['centers']

        # Scale back if needed
        if self.scl != 1:
            centers = centers * self.scl

        # Reconstruct pieces
        pieces = [tuple(centers[idx]) for idx in indices]

        # Reconstruct time series
        ts_recon = [initial_value]
        for length, increment in pieces:
            # Cluster centres are floats; a segment needs an integer length >= 1
            length = max(int(round(length)), 1)

            if length == 1:
                ts_recon.append(ts_recon[-1] + increment)
            else:
                # Linear interpolation for the segment
                start_val = ts_recon[-1]
                end_val = start_val + increment
                segment = np.linspace(start_val, end_val, length + 1)[1:] # Exclude start_val
                ts_recon.extend(segment)

        return np.array(ts_recon)


# --- 3. FeatureQuantization ---
class FeatureQuantization:
    """
    Quantization of ABBA-derived features for efficiency optimization (Step 4 in HSQP).

    ``quantize`` calibrates ``scale`` / ``zero_point`` on the array it is given
    and stores them on the instance; ``dequantize`` uses the values stored by
    the most recent ``quantize`` call.
    """
    def __init__(self, bit_width: int = 8, method: str = 'affine', block_size: int = 32):
        """
        Initialize quantization parameters.

        Args:
            bit_width: Target bit width (e.g., 8 for INT8, 4 for INT4)
            method: Quantization method ('affine', 'abs_max')
            block_size: Block size for block-wise quantization (not fully implemented here, kept for API)
        """
        self.bit_width = bit_width
        self.method = method
        self.block_size = block_size
        self.scale = None
        self.zero_point = None

        # Signed integer range representable with bit_width bits
        self.qmin = -(2 ** (bit_width - 1))
        self.qmax = 2 ** (bit_width - 1) - 1

    def _storage_dtype(self):
        """Return the smallest signed NumPy integer type that holds [qmin, qmax]."""
        if self.bit_width <= 8:
            return np.int8
        if self.bit_width <= 16:
            return np.int16
        if self.bit_width <= 32:
            return np.int32
        return np.int64

    def quantize(self, features: np.ndarray) -> np.ndarray:
        """
        Quantize features to lower precision.

        Args:
            features: Input features (must be non-empty)

        Returns:
            Quantized features as a signed integer array

        Raises:
            ValueError: If the quantization method is unknown.
        """
        features = np.asarray(features, dtype=np.float64)

        if self.method == 'abs_max':
            # Absolute max quantization: symmetric around zero
            abs_max = np.max(np.abs(features))
            if abs_max == 0:
                abs_max = 1.0  # Avoid division by zero

            self.scale = self.qmax / abs_max
            self.zero_point = 0

            q_features = np.round(features * self.scale)

        elif self.method == 'affine':
            # Affine quantization: map [f_min, f_max] onto [qmin, qmax]
            f_min = np.min(features)
            f_max = np.max(features)

            if f_min == f_max:
                # Constant input: store it entirely in the zero point so that
                # values outside [qmin, qmax] are not clipped away and the
                # round trip through dequantize() is exact.
                self.scale = 1.0
                self.zero_point = -float(f_min)
            else:
                self.scale = (self.qmax - self.qmin) / (f_max - f_min)
                self.zero_point = self.qmin - int(round(f_min * self.scale))

            q_features = np.round(features * self.scale + self.zero_point)

        else:
            raise ValueError(f"Unknown quantization method: {self.method}")

        q_features = np.clip(q_features, self.qmin, self.qmax)

        # Pick the storage type from the bit width; a fixed int16 for every
        # width above 8 bits silently wrapped values for wider settings.
        return q_features.astype(self._storage_dtype())

    def dequantize(self, q_features: np.ndarray) -> np.ndarray:
        """
        Dequantize features back to original precision.

        Args:
            q_features: Quantized features

        Returns:
            Dequantized features (float64)

        Raises:
            ValueError: If ``quantize`` has not been called yet or the method
                is unknown.
        """
        if self.scale is None or (self.method == 'affine' and self.zero_point is None):
            raise ValueError("Quantization parameters not set. Call quantize() first.")

        # Leave the narrow integer type before doing arithmetic: subtracting
        # the zero point in int8 wraps around (e.g. 127 - (-128) -> -1).
        q_features = np.asarray(q_features, dtype=np.float64)

        if self.method == 'abs_max':
            return q_features / self.scale
        elif self.method == 'affine':
            return (q_features - self.zero_point) / self.scale
        else:
            raise ValueError(f"Unknown quantization method: {self.method}")


# --- 4. HSQP (Main Orchestrator) ---
class HSQP:
    """
    Hierarchical Symbolic-Quantized Patching (HSQP) for time-series tokenization.

    Chains patching, ABBA symbolic aggregation and feature quantization, and
    optionally projects the quantized (length, increment) features through a
    linear embedding layer. This is a plain Python object, not an
    ``nn.Module``: the embedding layer it creates lazily is not registered
    with any parent module.
    """
    def __init__(self, 
                 patch_length: int = 24, 
                 stride: int = 12,
                 tol: float = 0.1, 
                 alpha: float = 0.1, 
                 k: int = 26,  # Limited to 26 for a-z symbols
                 bit_width: int = 8,
                 quant_method: str = 'affine',
                 embedding_dim: int = 64):
        """
        Initialize HSQP parameters.

        Args:
            patch_length: Length of each patch
            stride: Step size between patches
            tol: Tolerance for ABBA compression
            alpha: Parameter for ABBA digitization
            k: Number of symbols/clusters for ABBA
            bit_width: Target bit width for quantization
            quant_method: Quantization method
            embedding_dim: Dimension for LLM embedding
        """
        self.patching = TimeSeriesPatching(patch_length=patch_length, stride=stride)
        self.abba = ABBASymbolicAggregation(tol=tol, alpha=alpha, k=k)
        self.quantization = FeatureQuantization(bit_width=bit_width, method=quant_method)
        self.embedding_dim = embedding_dim

        # For LLM embedding (created on first use unless assigned by the caller)
        self.embedding = None

    def _symbolize_patches(self, patch_rows: np.ndarray) -> Tuple[List[str], List[List[Tuple[float, float]]]]:
        """Run ABBA compress + digitize on each 1-D patch; return (symbols, pieces) lists."""
        symbols_out = []
        pieces_out = []
        for patch in patch_rows:
            pieces = self.abba.compress(patch)
            symbols, _ = self.abba.digitize(pieces)
            symbols_out.append(''.join(symbols))
            pieces_out.append(pieces)
        return symbols_out, pieces_out

    def fit_transform(self, time_series: np.ndarray) -> Tuple[List[str], np.ndarray, List[List[Tuple[float, float]]]]:
        """
        Apply HSQP transformation to time series.

        Only the first feature of multi-feature input is symbolized. For
        batched (3-D) input, ``symbols_list`` and ``pieces_list`` are nested
        one level deeper (one inner list per batch item); the quantized
        features are always a single stacked [num_segments, 2] array.

        Args:
            time_series: Input time series data

        Returns:
            symbols_list: List of symbolic representations for each patch
            quantized_features: Quantized ABBA-derived features
            pieces_list: List of polygonal segments for each patch

        Raises:
            ValueError: If the input yields no patches at all.
        """
        # Step 2: Initial Patching
        patches = self.patching.create_patches(time_series)
        is_batched = patches.ndim == 4

        # Using first feature for simplicity; 2-D patches have no feature axis.
        series = patches[..., 0] if patches.ndim >= 3 else patches

        # Step 3: ABBA Symbolic Aggregation (batch-major, then patch order, so
        # the stateful ABBA model sees the patches in a deterministic order)
        if is_batched:
            symbols_list = []
            pieces_list = []
            for sample_patches in series:
                sample_symbols, sample_pieces = self._symbolize_patches(sample_patches)
                symbols_list.append(sample_symbols)
                pieces_list.append(sample_pieces)
            flat_pieces = [pieces for sample_pieces in pieces_list for pieces in sample_pieces]
        else:
            symbols_list, pieces_list = self._symbolize_patches(series)
            flat_pieces = pieces_list

        if not flat_pieces:
            raise ValueError(
                "HSQP produced no patches: the input sequence is shorter than one patch"
            )

        # Step 4: Quantization of ABBA-Derived Features
        # A patch without any segment contributes one all-zero placeholder row.
        features = np.vstack([
            np.array(pieces) if pieces else np.zeros((1, 2))
            for pieces in flat_pieces
        ])
        quantized_features = self.quantization.quantize(features)

        return symbols_list, quantized_features, pieces_list

    def create_llm_embeddings(self, quantized_features: np.ndarray) -> torch.Tensor:
        """
        Create LLM embeddings from quantized features (Step 5 in HSQP).

        Args:
            quantized_features: Quantized ABBA-derived features, one
                (length, increment) pair per row

        Returns:
            Embeddings for LLM, one ``embedding_dim`` vector per input row
        """
        # Initialize embedding layer if not already created
        if self.embedding is None:
            # A simple linear layer serves as the embedding; the input size is
            # 2 (length, increment).
            self.embedding = nn.Linear(2, self.embedding_dim)

        # Put the features on the layer's device/dtype so the call also works
        # when the layer has been moved off the CPU.
        weight = self.embedding.weight
        q_features_tensor = torch.from_numpy(np.asarray(quantized_features)).to(
            device=weight.device, dtype=weight.dtype
        )

        # Pass through the linear embedding layer
        return self.embedding(q_features_tensor)

    def inverse_transform(self, embeddings: torch.Tensor, original_length: int) -> np.ndarray:
        """
        Inverse transform from LLM embeddings back to time series.

        NOTE: this is a stub. It ignores both arguments and always returns a
        zero array of shape (1, 2).

        Args:
            embeddings: LLM embeddings (output of the LLM)
            original_length: Original sequence length

        Returns:
            Dequantized features (length, increment) - simplified output for plugin context.

        Raises:
            ValueError: If no embedding layer has been created yet.
        """
        if self.embedding is None:
            raise ValueError("Embedding layer not initialized. Call create_llm_embeddings() first.")

        # Placeholder for dequantized features
        return np.zeros((1, 2))


# --- 5. HSQP Plugin for PatchTST ---
class HSQP_PatchTST_Plugin(nn.Module):
    """
    HSQP Plugin to replace the standard PatchTST Patching/Embedding layer.

    Input: [B, L, C]
    Output: [B * C, Num_Tokens, D_Model]

    NOTE: Num_Tokens is the largest number of ABBA segments found in the
    current batch, so it can differ from batch to batch. The symbolic
    processing runs in NumPy on the CPU and is not differentiable; only the
    per-channel linear embedding layers are trainable.
    """
    def __init__(self, configs):
        """
        Build one HSQP processor and one embedding layer per input channel.

        Args:
            configs: Object providing ``patch_len``, ``stride``, ``d_model``
                and ``c_in``; the optional attributes ``hsqp_k``,
                ``hsqp_tol``, ``hsqp_bit_width`` and ``hsqp_quant_method``
                override the HSQP defaults.
        """
        super(HSQP_PatchTST_Plugin, self).__init__()

        # HSQP Parameters
        self.patch_length = configs.patch_len
        self.stride = configs.stride
        self.embedding_dim = configs.d_model
        self.num_channels = configs.c_in # Number of input features/channels

        # HSQP Initialization (one instance per channel for independent processing).
        # HSQP is not an nn.Module, so the processors live in a plain list;
        # nn.ModuleList rejects non-Module entries.
        self.hsqp_channels = [
            HSQP(
                patch_length=self.patch_length,
                stride=self.stride,
                embedding_dim=self.embedding_dim,
                k=getattr(configs, 'hsqp_k', 26),
                tol=getattr(configs, 'hsqp_tol', 0.1),
                bit_width=getattr(configs, 'hsqp_bit_width', 8),
                quant_method=getattr(configs, 'hsqp_quant_method', 'affine')
            ) for _ in range(self.num_channels)
        ]

        # The trainable part is registered here so that optimizers,
        # .to(device) and state_dict() see it. Input size 2 = (length, increment).
        self.embeddings = nn.ModuleList([
            nn.Linear(2, self.embedding_dim) for _ in range(self.num_channels)
        ])
        # Share each layer with its processor so HSQP.create_llm_embeddings()
        # uses the registered weights instead of creating its own layer.
        for hsqp_processor, layer in zip(self.hsqp_channels, self.embeddings):
            hsqp_processor.embedding = layer

    def forward(self, x):
        """
        Forward pass of the HSQP plugin.

        Args:
            x: Input time series tensor of shape [Batch, Seq_Len, Channels]

        Returns:
            Embeddings for the Transformer encoder of shape [Batch * Channels, Num_Tokens, D_Model]
            n_vars: Number of channels (C)
        """
        B, L, C = x.shape # Batch, Seq_Len, Channels

        # detach(): the input may require grad (e.g. after a learnable
        # normalisation layer) and .numpy() refuses such tensors.
        x_np = x.detach().cpu().numpy()

        # tokens[b][c] holds the [Num_Segments, D_Model] embedding of one series.
        tokens = [[None] * C for _ in range(B)]

        # Process each channel independently (Channel-Independence in PatchTST)
        for c in range(C):
            hsqp_processor = self.hsqp_channels[c]
            embedding = self.embeddings[c]

            # Process each sample in the batch
            for b in range(B):
                # 1. HSQP Transformation (Patching, ABBA, Quantization)
                # The input to fit_transform is [L]
                _, quantized_features, _ = hsqp_processor.fit_transform(x_np[b, :, c])

                # 2. Create LLM Embeddings on the layer's own device
                # Output shape: [Num_Segments, D_Model]
                features = torch.from_numpy(quantized_features).to(
                    device=embedding.weight.device, dtype=embedding.weight.dtype
                )
                tokens[b][c] = embedding(features)

        # Flatten batch-major so row b * C + c belongs to sample b, channel c.
        flat_tokens = [tokens[b][c] for b in range(B) for c in range(C)]

        # Zero-pad every sequence to the longest one across the whole batch
        # AND all channels; padding per channel only leaves channels with
        # different lengths that cannot be stacked.
        output_embeddings = nn.utils.rnn.pad_sequence(flat_tokens, batch_first=True)

        return output_embeddings.to(x.device), C


# --- End of HSQP Plugin Implementation ---

## 3. PatchTST Model Modification

The `PatchTST_backbone.py` file is modified so that, when the `use_hsqp` flag is set in the configuration, it uses `HSQP_PatchTST_Plugin` in place of the standard patching logic. Keeping HSQP behind this flag makes it an optional plugin rather than a hard-wired change, which addresses the reviewer's concern.

In [None]:
%%writefile layers/PatchTST_backbone.py
__all__ = ['PatchTST_backbone']

# Cell
from typing import Callable, Optional
import torch
from torch import nn
from torch import Tensor
import torch.nn.functional as F
import numpy as np

#from collections import OrderedDict
from layers.PatchTST_layers import *
from layers.RevIN import RevIN
from models.hsqp_plugin import HSQP_PatchTST_Plugin # Import HSQP Plugin

# Cell
class PatchTST_backbone(nn.Module):
    """PatchTST backbone with an optional HSQP tokenisation front-end.

    With ``use_hsqp=False`` the input series is cut into fixed-size patches with
    ``unfold`` and passed through ``TSTiEncoder`` and the head, all of which are
    built in ``__init__``.

    With ``use_hsqp=True`` the series is tokenised by ``HSQP_PatchTST_Plugin``.
    The number of tokens is only known after the first call of the plugin, so
    the encoder and the head are built lazily on the first forward pass.

    NOTE: parameters of lazily built modules do not exist while the model is
    being constructed. An optimizer created from ``model.parameters()`` before
    the first forward pass will therefore not update them; run one warm-up
    forward pass before creating the optimizer when ``use_hsqp`` is enabled.
    """
    def __init__(self, c_in:int, context_window:int, target_window:int, patch_len:int, stride:int, max_seq_len:Optional[int]=1024, 
                 n_layers:int=3, d_model=128, n_heads=16, d_k:Optional[int]=None, d_v:Optional[int]=None,
                 d_ff:int=256, norm:str='BatchNorm', attn_dropout:float=0., dropout:float=0., act:str="gelu", key_padding_mask:bool='auto',
                 padding_var:Optional[int]=None, attn_mask:Optional[Tensor]=None, res_attention:bool=True, pre_norm:bool=False, store_attn:bool=False,
                 pe:str='zeros', learn_pe:bool=True, fc_dropout:float=0., head_dropout = 0, padding_patch = None,
                 use_hsqp:bool=False, # New argument for HSQP
                 hsqp_k:int=26, hsqp_tol:float=0.1, hsqp_bit_width:int=8, hsqp_quant_method:str='affine', # HSQP args
                 pretrain_head:bool=False, head_type = 'flatten', individual = False, revin = True, affine = True, subtract_last = False,
                 verbose:bool=False, **kwargs):
        
        super().__init__()
        
        # Store configs: they are needed again when modules are built lazily
        self.c_in = c_in
        self.context_window = context_window
        self.target_window = target_window
        self.max_seq_len = max_seq_len
        self.n_layers = n_layers
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_k
        self.d_v = d_v
        self.d_ff = d_ff
        self.norm = norm
        self.attn_dropout = attn_dropout
        self.dropout = dropout
        self.act = act
        self.key_padding_mask = key_padding_mask
        self.padding_var = padding_var
        self.attn_mask = attn_mask
        self.res_attention = res_attention
        self.pre_norm = pre_norm
        self.store_attn = store_attn
        self.pe = pe
        self.learn_pe = learn_pe
        self.verbose = verbose
        self.fc_dropout = fc_dropout
        self.head_dropout = head_dropout
        self.individual = individual
        # Extra encoder options must reach the encoder on the lazy path as well
        self._encoder_kwargs = dict(kwargs)
        
        # HSQP hyper-parameters. The plugin receives this module as its config
        # object, so they have to be attributes before the plugin is created.
        # NOTE(review): which attributes the plugin actually reads is not
        # visible from this file - confirm against models/hsqp_plugin.py.
        self.hsqp_k = hsqp_k
        self.hsqp_tol = hsqp_tol
        self.hsqp_bit_width = hsqp_bit_width
        self.hsqp_quant_method = hsqp_quant_method
        
        # RevIn
        self.revin = revin
        if self.revin: self.revin_layer = RevIN(c_in, affine=affine, subtract_last=subtract_last)
        
        # Patching
        self.patch_len = patch_len
        self.stride = stride
        self.padding_patch = padding_patch
        self.use_hsqp = use_hsqp
        
        # Head configuration
        self.n_vars = c_in
        self.pretrain_head = pretrain_head
        self.head_type = head_type
        
        if self.use_hsqp:
            self.hsqp_layer = HSQP_PatchTST_Plugin(self) # Pass self (configs) to the plugin
            # The token count is dynamic with HSQP: encoder and head are built
            # in the first forward pass (see _init_hsqp_modules).
            self.patch_num = None
            self.head_nf = None
            self.backbone = None
            self.head = None
        else:
            patch_num = int((context_window - patch_len)/stride + 1)
            if padding_patch == 'end': # can be modified to general case
                self.padding_patch_layer = nn.ReplicationPad1d((0, stride)) 
                patch_num += 1
            self.patch_num = patch_num
            
            # Backbone
            self.backbone = self._build_encoder(patch_num)
            
            # Head
            self.head_nf = d_model * patch_num
            head = self._build_head(self.head_nf)
            if head is not None:
                self.head = head
        
    def _build_encoder(self, patch_num):
        """Create the channel-independent encoder for ``patch_num`` tokens."""
        # NOTE(review): `norm` is stored but not forwarded to the encoder
        # (unchanged from the previous construction call) - confirm intent.
        return TSTiEncoder(self.c_in, patch_num=patch_num, patch_len=self.patch_len, max_seq_len=self.max_seq_len,
                           n_layers=self.n_layers, d_model=self.d_model, n_heads=self.n_heads, d_k=self.d_k, d_v=self.d_v, d_ff=self.d_ff,
                           attn_dropout=self.attn_dropout, dropout=self.dropout, act=self.act, key_padding_mask=self.key_padding_mask, padding_var=self.padding_var,
                           attn_mask=self.attn_mask, res_attention=self.res_attention, pre_norm=self.pre_norm, store_attn=self.store_attn,
                           pe=self.pe, learn_pe=self.learn_pe, verbose=self.verbose, **self._encoder_kwargs)
    
    def _build_head(self, head_nf):
        """Create the output head for ``head_nf`` input features, or None if the head type is unknown."""
        if self.pretrain_head:
            return self.create_pretrain_head(head_nf, self.c_in, self.fc_dropout) # custom head passed as a partial func with all its kwargs
        if self.head_type == 'flatten':
            return Flatten_Head(self.individual, self.n_vars, head_nf, self.target_window, head_dropout=self.head_dropout)
        return None
    
    def _init_hsqp_modules(self, patch_num, device):
        """Build encoder and head once the HSQP token count is known.
        
        The new modules are moved to ``device`` and put into the same
        train/eval mode as this module, because ``.to()`` / ``.train()`` calls
        made before the first forward pass could not reach them.
        """
        if not self.pretrain_head and self.head_type != 'flatten':
            raise ValueError(f"Unsupported head_type for HSQP: {self.head_type!r}")
        backbone = self._build_encoder(patch_num)
        head = self._build_head(self.d_model * patch_num)
        self.patch_num = patch_num
        self.head_nf = self.d_model * patch_num
        self.backbone = backbone.to(device).train(self.training)
        self.head = head.to(device).train(self.training)
    
    def _fit_token_count(self, tokens):
        """Zero-pad or truncate ``tokens`` ([bs * nvars x n_tokens x d_model]) to ``self.patch_num`` tokens.
        
        Encoder and head are sized for the token count of the first batch;
        a batch with a different count would otherwise fail with a shape error.
        """
        n_tokens = tokens.shape[1]
        if n_tokens == self.patch_num:
            return tokens
        if n_tokens > self.patch_num:
            return tokens[:, :self.patch_num, :]
        padding = tokens.new_zeros(tokens.shape[0], self.patch_num - n_tokens, tokens.shape[2])
        return torch.cat([tokens, padding], dim=1)
    
    def forward(self, z):                                                                   # z: [bs x nvars x seq_len]
        """Forecast ``target_window`` steps for every variable.
        
        Args:
            z: input series, [bs x nvars x seq_len].
        
        Returns:
            Tensor of shape [bs x nvars x target_window].
        """
        # norm
        if self.revin: 
            z = z.permute(0,2,1)
            z = self.revin_layer(z, 'norm')
            z = z.permute(0,2,1)
            
        # do patching
        if self.use_hsqp:
            # The plugin takes [B, L, C] and returns ([B * C, n_tokens, d_model], C),
            # while this module works on [B, C, L]; hence the permute.
            z_permuted = z.permute(0, 2, 1).contiguous() # [B, L, C]
            z_enc, n_vars = self.hsqp_layer(z_permuted)
            
            # Dynamically initialize TSTiEncoder and Head if not done
            if self.backbone is None:
                self._init_hsqp_modules(z_enc.shape[1], z_enc.device)
            z_enc = self._fit_token_count(z_enc)
            
            # model
            z = self.backbone(z_enc)
            if z.dim() == 3:
                # Encoder kept the flat layout [bs * nvars x patch_num x d_model]:
                # restore [bs x nvars x d_model x patch_num] expected by the head.
                z = z.reshape(-1, n_vars, z.shape[-2], z.shape[-1]).permute(0, 1, 3, 2)
            z = self.head(z)                                                                    # z: [bs x nvars x target_window]
            
        else:
            # Standard PatchTST logic
            if self.padding_patch == 'end':
                z = self.padding_patch_layer(z)
            z = z.unfold(dimension=-1, size=self.patch_len, step=self.stride)                   # z: [bs x nvars x patch_num x patch_len]
            z = z.permute(0,1,3,2)                                                              # z: [bs x nvars x patch_len x patch_num]
            
            # model
            z = self.backbone(z)                                                                # z: [bs x nvars x d_model x patch_num]
            z = self.head(z)                                                                    # z: [bs x nvars x target_window]
        
        # denorm
        if self.revin: 
            z = z.permute(0,2,1)
            z = self.revin_layer(z, 'denorm')
            z = z.permute(0,2,1)
        return z
    
    def create_pretrain_head(self, head_nf, vars, dropout):
        """Return the pre-training head: dropout followed by a kernel-size-1 Conv1d from ``head_nf`` to ``vars`` channels."""
        return nn.Sequential(nn.Dropout(dropout),
                    nn.Conv1d(head_nf, vars, 1)
                    )


class Flatten_Head(nn.Module):
    """Linear forecasting head applied to the flattened last two dimensions.

    Args:
        individual: if true, every variable gets its own flatten/linear/dropout
            stack; otherwise a single stack is shared by all variables.
        n_vars: number of variables (only used in the individual mode).
        nf: number of input features of the linear layer(s).
        target_window: number of output features of the linear layer(s).
        head_dropout: dropout probability applied after the linear layer(s).
    """
    def __init__(self, individual, n_vars, nf, target_window, head_dropout=0):
        super().__init__()

        self.individual = individual
        self.n_vars = n_vars

        if not self.individual:
            # One projection shared across all variables
            self.flatten = nn.Flatten(start_dim=-2)
            self.linear = nn.Linear(nf, target_window)
            self.dropout = nn.Dropout(head_dropout)
            return

        # One independent projection per variable
        var_ids = range(self.n_vars)
        self.linears = nn.ModuleList([nn.Linear(nf, target_window) for _ in var_ids])
        self.dropouts = nn.ModuleList([nn.Dropout(head_dropout) for _ in var_ids])
        self.flattens = nn.ModuleList([nn.Flatten(start_dim=-2) for _ in var_ids])

    def forward(self, x):                                 # x: [bs x nvars x d_model x patch_num]
        if not self.individual:
            return self.dropout(self.linear(self.flatten(x)))

        # Each variable is flattened to [bs x d_model * patch_num] and projected
        # to [bs x target_window] by its own layers.
        per_variable = [
            self.dropouts[i](self.linears[i](self.flattens[i](x[:, i, :, :])))
            for i in range(self.n_vars)
        ]
        return torch.stack(per_variable, dim=1)           # [bs x nvars x target_window]
        
        
    
    
class TSTiEncoder(nn.Module):  #i means channel-independent
    """Channel-independent Transformer encoder.

    Two input layouts are accepted by ``forward``:

    * 4-D ``[bs x nvars x patch_len x patch_num]``: raw patches, which are
      projected to ``d_model`` with ``W_P`` (standard PatchTST path).
    * 3-D ``[bs * nvars x patch_num x d_model]``: tokens that are already
      embedded (HSQP path). ``W_P`` is skipped, and ``c_in`` is used as the
      number of variables to unflatten the first dimension.

    In both cases the result has shape ``[bs x nvars x d_model x patch_num]``.
    """
    def __init__(self, c_in, patch_num, patch_len, max_seq_len=1024,
                 n_layers=3, d_model=128, n_heads=16, d_k=None, d_v=None,
                 d_ff=256, norm='BatchNorm', attn_dropout=0., dropout=0., act="gelu", store_attn=False,
                 key_padding_mask='auto', padding_var=None, attn_mask=None, res_attention=True, pre_norm=False,
                 pe='zeros', learn_pe=True, verbose=False, **kwargs):
        
        
        super().__init__()
        
        # Needed to recover [bs x nvars x ...] from a flat 3-D input
        self.n_vars = c_in
        self.patch_num = patch_num
        self.patch_len = patch_len
        
        # Input encoding
        q_len = patch_num
        self.W_P = nn.Linear(patch_len, d_model)        # Eq 1: projection of feature vectors onto a d-dim vector space
        self.seq_len = q_len

        # Positional encoding
        self.W_pos = positional_encoding(pe, learn_pe, q_len, d_model)

        # Residual dropout
        self.dropout = nn.Dropout(dropout)

        # Encoder
        self.encoder = TSTEncoder(q_len, d_model, n_heads, d_k=d_k, d_v=d_v, d_ff=d_ff, norm=norm, attn_dropout=attn_dropout, dropout=dropout,
                                   pre_norm=pre_norm, activation=act, res_attention=res_attention, n_layers=n_layers, store_attn=store_attn)

        
    def forward(self, x) -> Tensor:                                              # x: [bs x nvars x patch_len x patch_num] or [bs * nvars x patch_num x d_model] (for HSQP)
        """Encode patches or pre-embedded tokens.

        Args:
            x: 4-D raw patches or 3-D embedded tokens (see class docstring).

        Returns:
            Tensor of shape [bs x nvars x d_model x patch_num].

        Raises:
            ValueError: if ``x`` is neither 3-D nor 4-D, or if the first
                dimension of a 3-D input is not a multiple of ``c_in``.
        """
        if x.dim() == 4:
            # Standard path: project every patch to d_model
            n_vars = x.shape[1]
            x = x.permute(0,1,3,2)                                                   # x: [bs x nvars x patch_num x patch_len]
            x = self.W_P(x)                                                          # x: [bs x nvars x patch_num x d_model]
            u = torch.reshape(x, (x.shape[0]*x.shape[1],x.shape[2],x.shape[3]))      # u: [bs * nvars x patch_num x d_model]
        elif x.dim() == 3:
            # HSQP path: tokens are already embedded, only the layout is flat
            n_vars = self.n_vars
            if x.shape[0] % n_vars != 0:
                raise ValueError(
                    f"First dimension of a 3-D input ({x.shape[0]}) must be a multiple of c_in ({n_vars})")
            u = x                                                                    # u: [bs * nvars x patch_num x d_model]
        else:
            raise ValueError(f"Expected a 3-D or 4-D input, got {x.dim()} dimensions")

        # Positional encoding and residual dropout
        # NOTE(review): presumably W_pos is [patch_num x d_model], so the token
        # count of u has to equal patch_num - confirm against positional_encoding.
        u = self.dropout(u + self.W_pos)                                             # u: [bs * nvars x patch_num x d_model]

        # Encoder
        z = self.encoder(u)                                                          # z: [bs * nvars x patch_num x d_model]
        z = torch.reshape(z, (-1,n_vars,z.shape[-2],z.shape[-1]))                    # z: [bs x nvars x patch_num x d_model]
        z = z.permute(0,1,3,2)                                                       # z: [bs x nvars x d_model x patch_num]
        
        return z


class TSTEncoder(nn.Module):
    """Stack of ``n_layers`` ``TSTEncoderLayer`` modules applied in sequence.

    With ``res_attention`` enabled, the attention scores returned by one layer
    are handed to the next layer through its ``prev`` argument.
    """
    def __init__(self, q_len, d_model, n_heads, d_k=None, d_v=None, d_ff=None, 
                        norm='BatchNorm', attn_dropout=0., dropout=0., activation='gelu',
                        res_attention=False, n_layers=1, pre_norm=False, store_attn=False):
        super().__init__()

        # Every layer is built from the same configuration
        layer_config = dict(n_heads=n_heads, d_k=d_k, d_v=d_v, d_ff=d_ff, norm=norm,
                            attn_dropout=attn_dropout, dropout=dropout,
                            activation=activation, res_attention=res_attention,
                            pre_norm=pre_norm, store_attn=store_attn)
        self.layers = nn.ModuleList()
        for _ in range(n_layers):
            self.layers.append(TSTEncoderLayer(q_len, d_model, **layer_config))
        self.res_attention = res_attention

    def forward(self, src:Tensor, key_padding_mask:Optional[Tensor]=None, attn_mask:Optional[Tensor]=None):
        hidden = src
        if not self.res_attention:
            for layer in self.layers:
                hidden = layer(hidden, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
            return hidden

        # Residual attention: thread the scores from layer to layer
        prev_scores = None
        for layer in self.layers:
            hidden, prev_scores = layer(hidden, prev=prev_scores, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
        return hidden


class TSTEncoderLayer(nn.Module):
    """One Transformer encoder layer: multi-head self-attention followed by a
    position-wise feed-forward network, each with dropout, a residual
    connection and normalisation (batch norm when ``norm`` contains "batch",
    layer norm otherwise).
    """
    def __init__(self, q_len, d_model, n_heads, d_k=None, d_v=None, d_ff=256, store_attn=False,
                 norm='BatchNorm', attn_dropout=0, dropout=0., bias=True, activation="gelu", res_attention=False, pre_norm=False):
        super().__init__()
        assert not d_model%n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
        d_k = d_model // n_heads if d_k is None else d_k
        d_v = d_model // n_heads if d_v is None else d_v

        # Multi-Head attention
        self.res_attention = res_attention
        self.self_attn = _MultiheadAttention(d_model, n_heads, d_k, d_v, attn_dropout=attn_dropout, proj_dropout=dropout, res_attention=res_attention)

        # Add & Norm
        self.dropout_attn = nn.Dropout(dropout)
        if "batch" in norm.lower():
            self.norm_attn = nn.Sequential(Transpose(1,2), nn.BatchNorm1d(d_model), Transpose(1,2))
        else:
            self.norm_attn = nn.LayerNorm(d_model)

        # Position-wise Feed-Forward
        self.ff = nn.Sequential(nn.Linear(d_model, d_ff, bias=bias),
                                get_activation_fn(activation),
                                nn.Dropout(dropout),
                                nn.Linear(d_ff, d_model, bias=bias))

        # Add & Norm
        self.dropout_ffn = nn.Dropout(dropout)
        if "batch" in norm.lower():
            self.norm_ffn = nn.Sequential(Transpose(1,2), nn.BatchNorm1d(d_model), Transpose(1,2))
        else:
            self.norm_ffn = nn.LayerNorm(d_model)

        self.pre_norm = pre_norm
        self.store_attn = store_attn


    def forward(self, src:Tensor, prev:Optional[Tensor]=None, key_padding_mask:Optional[Tensor]=None, attn_mask:Optional[Tensor]=None) -> Tensor:
        """Apply the layer to ``src``.

        Args:
            src: input tensor passed as query, key and value to the attention.
            prev: attention scores of the previous layer (residual attention only).
            key_padding_mask: forwarded to the attention module.
            attn_mask: forwarded to the attention module.

        Returns:
            ``(output, scores)`` when ``res_attention`` is enabled, otherwise
            just ``output``. ``TSTEncoder`` relies on this: without residual
            attention it feeds the return value straight into the next layer.
        """

        # Multi-Head attention sublayer
        if self.pre_norm:
            src = self.norm_attn(src)
        ## Multi-Head attention
        if self.res_attention:
            src2, attn, scores = self.self_attn(src, src, src, prev, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
        else:
            src2, attn = self.self_attn(src, src, src, key_padding_mask=key_padding_mask, attn_mask=attn_mask)

        if self.store_attn: 
            self.attn = attn
        
        ## Add & Norm
        src = src + self.dropout_attn(src2)
        if not self.pre_norm:
            src = self.norm_attn(src)

        # Feed-forward sublayer
        if self.pre_norm:
            src = self.norm_ffn(src)
        ## Position-wise Feed-Forward
        src2 = self.ff(src)
        
        ## Add & Norm
        src = src + self.dropout_ffn(src2)
        if not self.pre_norm:
            src = self.norm_ffn(src)

        # Only the residual-attention variant exposes the scores; returning a
        # tuple unconditionally would hand a tuple to the next layer.
        if self.res_attention:
            return src, scores
        return src


# --- End of PatchTST_backbone.py Modification ---

## 4. Data Preparation and Experiment Loop

This section includes instructions for uploading the datasets and the main loop to run the experiments one dataset at a time.

### 4.1 Upload Datasets

**Action Required:** Please upload your CSV files (`electricity.csv`, `ETTh1.csv`, `ETTh2.csv`, `ETTm1.csv`, `ETTm2.csv`, `weather.csv`, `traffic.csv`, and `national_illness.csv`) to the Colab environment's file system. A common practice is to create a `data` folder inside the `PatchTST_supervised` directory and place them there.

In [None]:
# Create data directory and move files (assuming they are uploaded to the root)
# The relative paths below assume the working directory is still
# PatchTST/PatchTST_supervised, as set by the `%cd` in the setup cell.
!mkdir -p data
# If you uploaded the files to the root of the Colab environment, run the following:
# !mv ../../<your_uploaded_file>.csv data/

print("Please ensure your datasets are in the 'data' folder inside the 'PatchTST_supervised' directory.")

### 4.2 Experiment Configuration and Execution

We define a helper that builds the experiment configuration (`get_default_args`) and a function that runs a single experiment (`run_experiment`). The main loop then iterates over the datasets, runs the baseline and the HSQP variant for each one, and collects the results.

In [None]:
import argparse
import os
import torch
import numpy as np
from exp.exp_main import Exp_Main
from utils.tools import setting

# Define the list of datasets
# Benchmarks are processed in exactly this order, one at a time
DATASETS = [
    'electricity',
    'ETTh1',
    'ETTh2',
    'ETTm1',
    'ETTm2',
    'weather',
    'traffic',
    'national_illness',
]

def get_default_args(dataset_name, use_hsqp=False):
    """Build the argument namespace for one experiment.

    All options take their defaults (the command line is never read); the
    defaults are then adjusted for ``dataset_name`` and for the available
    hardware.

    Args:
        dataset_name: dataset identifier; also used as ``data`` and, with a
            ``.csv`` suffix, as ``data_path``.
        use_hsqp: default value of the ``use_hsqp`` option.

    Returns:
        The populated ``argparse.Namespace``.
    """
    SWITCH = 'store_true'  # marks boolean flags in the table below

    # (option, type or SWITCH, default, help) - kept in declaration order
    option_table = [
        # Basic Config
        ('--model', str, 'PatchTST', 'model name, PatchTST'),
        ('--data', str, dataset_name, 'dataset name'),
        ('--root_path', str, './data/', 'root path of the data file'),
        ('--data_path', str, f'{dataset_name}.csv', 'data file'),
        ('--features', str, 'M', 'forecasting task, options:[M, S, MS]; M:multivariate, S:univariate, MS:multivariate for S-model'),
        ('--target', str, 'OT', 'target feature in S or MS task'),
        ('--freq', str, 'h', 'freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], A:annually, custom:freq(e.g. 15min)'),
        ('--checkpoints', str, './checkpoints/', 'location of model checkpoints'),
        # Forecasting Task
        ('--seq_len', int, 96, 'input sequence length'),
        ('--label_len', int, 48, 'start token length'),
        ('--pred_len', int, 96, 'prediction sequence length'),
        # Model Config
        ('--enc_in', int, 7, 'encoder input size'),
        ('--d_model', int, 128, 'dimension of model'),
        ('--d_ff', int, 256, 'dimension of fcn'),
        ('--n_heads', int, 16, 'num of heads'),
        ('--e_layers', int, 3, 'num of encoder layers'),
        ('--dropout', float, 0.1, 'dropout'),
        ('--patch_len', int, 16, 'patch length'),
        ('--stride', int, 8, 'stride'),
        ('--individual', SWITCH, False, 'DLinear: Individual or not'),
        ('--revin', SWITCH, True, 'RevIN'),
        ('--affine', SWITCH, True, 'RevIN-affine'),
        ('--subtract_last', SWITCH, False, 'RevIN-subtract_last'),
        # HSQP Config
        ('--use_hsqp', SWITCH, use_hsqp, 'whether to use HSQP plugin'),
        ('--hsqp_k', int, 26, 'number of symbols for ABBA'),
        ('--hsqp_tol', float, 0.1, 'tolerance for ABBA compression'),
        ('--hsqp_bit_width', int, 8, 'bit width for quantization'),
        ('--hsqp_quant_method', str, 'affine', 'quantization method'),
        # Training Config
        ('--train_epochs', int, 10, 'train epochs'),
        ('--batch_size', int, 32, 'batch size of train input data'),
        ('--patience', int, 3, 'early stopping patience'),
        ('--learning_rate', float, 0.0001, 'optimizer learning rate'),
        ('--loss', str, 'mse', 'loss function'),
        ('--lradj', str, 'type1', 'adjust learning rate'),
        ('--use_gpu', bool, True, 'use gpu'),
        ('--gpu', int, 0, 'gpu'),
        ('--use_multi_gpu', SWITCH, False, 'use multiple gpus'),
        ('--devices', str, '0,1,2,3', 'device ids of multile gpus'),
        ('--seed', int, 2021, 'random seed'),
        # Other Config
        ('--num_workers', int, 10, 'num of workers'),
        ('--itr', int, 1, 'experiments times'),
        ('--des', str, 'test', 'exp description'),
        ('--inverse', SWITCH, False, 'inverse output data'),
        ('--use_amp', SWITCH, False, 'use automatic mixed precision training'),
        ('--task_name', str, 'long_term_forecast', 'task name, options:[long_term_forecast, short_term_forecast, imputation, classification]'),
        ('--train_only', SWITCH, False, 'train only'),
    ]

    parser = argparse.ArgumentParser(description='PatchTST Forecasting')
    for option, kind, default, help_text in option_table:
        if kind is SWITCH:
            parser.add_argument(option, action='store_true', default=default, help=help_text)
        else:
            parser.add_argument(option, type=kind, default=default, help=help_text)

    # An empty argv keeps argparse away from the notebook's own command line
    args = parser.parse_args([])

    # Dataset-specific overrides; unknown names keep the defaults
    dataset_overrides = (
        (('ETTh1', 'ETTh2'), {'enc_in': 7}),
        (('ETTm1', 'ETTm2'), {'enc_in': 7}),
        (('electricity',), {'enc_in': 321}),
        (('traffic',), {'enc_in': 862}),
        (('weather',), {'enc_in': 21}),
        (('national_illness',), {'enc_in': 1, 'features': 'S', 'target': 'y', 'freq': 'w'}),
    )
    for names, overrides in dataset_overrides:
        if any(dataset_name == name for name in names):
            for field, value in overrides.items():
                setattr(args, field, value)
            break

    # Device selection follows the hardware that is actually present
    args.use_gpu = torch.cuda.is_available()
    args.devices = '0'

    return args

def run_experiment(dataset_name, use_hsqp):
    """Train and test PatchTST on a single dataset.

    Args:
        dataset_name: name of the dataset, passed to ``get_default_args``.
        use_hsqp: whether the HSQP plugin is enabled for this run.

    Returns:
        Tuple ``(mae, mse)`` as returned by ``exp.test``.

    The experiment object is released and the CUDA cache is emptied even when
    training or testing raises, so a failed run does not keep its memory
    alive for the rest of the session.
    """
    import gc

    print(f"\n{'='*50}")
    print(f"Starting experiment for Dataset: {dataset_name}, HSQP: {use_hsqp}")
    print(f"{'='*50}")
    
    args = get_default_args(dataset_name, use_hsqp)
    
    # Set up the experiment environment
    # NOTE(review): the return value of `setting(args)` is discarded and the
    # `setting` callable itself is handed to `exp.train` / `exp.test` below.
    # Whether those methods expect the callable or a run identifier cannot be
    # told from this file - confirm against `utils.tools` and `Exp_Main`.
    setting(args)
    
    exp = None
    try:
        # Initialize and run the experiment
        exp = Exp_Main(args)
        
        # Train and Test
        print('>>>>>>>start training>>>>>>>>>>>>>>>>>>>>>>>>>>')
        exp.train(setting)
        
        print('>>>>>>>start testing>>>>>>>>>>>>>>>>>>>>>>>>>>')
        mae, mse = exp.test(setting, test=1)
        
        print(f"Experiment finished for {dataset_name}. MAE: {mae:.4f}, MSE: {mse:.4f}")
        return mae, mse
    finally:
        # Clean up to free memory before the next run. Collect garbage first so
        # that tensors kept alive by reference cycles are released before the
        # CUDA cache is emptied.
        del exp
        gc.collect()
        torch.cuda.empty_cache()

if __name__ == '__main__':
    import pandas as pd

    results_path = 'experiment_results.csv'
    # (label, use_hsqp) for the two runs of every dataset; baseline first
    methods = (
        ('PatchTST (Baseline)', False),
        ('PatchTST + HSQP', True),
    )

    all_results = []
    
    for dataset in DATASETS:
        for method, use_hsqp in methods:
            mae, mse = run_experiment(dataset, use_hsqp=use_hsqp)
            all_results.append({
                'Dataset': dataset,
                'Method': method,
                'MAE': mae,
                'MSE': mse
            })
            # Write the results collected so far after every run, so that a
            # later failure (e.g. the session running out of RAM) does not
            # discard the finished experiments.
            pd.DataFrame(all_results).to_csv(results_path, index=False)
        
    # Display final results
    results_df = pd.DataFrame(all_results)
    print("\n" + "#"*50)
    print("Final Experiment Results")
    print("#"*50)
    try:
        print(results_df.to_markdown(index=False))
    except ImportError:
        # to_markdown needs the optional `tabulate` package, which the setup
        # cell does not install; fall back to a plain-text table.
        print(results_df.to_string(index=False))
    
    # Save results to a file
    results_df.to_csv(results_path, index=False)
    print("Results saved to experiment_results.csv")

## 5. Run the Experiments

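After completing the setup and ensuring your datasets are in the `data` folder, run the following cell to start the full experiment loop. This will run the baseline and HSQP version for each dataset sequentially. Note that the `python` interpreter cannot execute an `.ipynb` file directly (a notebook is a JSON document, not a Python script). Inside the notebook, the loop in Section 4.2 already runs when that cell is executed, because `__name__ == '__main__'` holds in a notebook kernel.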

In [None]:
# Run the main experiment loop
# NOTE(review): `python` expects a Python script, but an .ipynb file is a JSON
# document, so this command is expected to fail. The experiment loop is guarded
# by `if __name__ == '__main__':` in the cell of Section 4.2, and that condition
# is true inside a notebook kernel, so executing that cell already runs the loop.
!python PatchTST_HSQP_Colab.ipynb