Basic Imports and Device Setup

In [None]:
# Standard Libraries
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# PyTorch Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
# Scikit-learn for Preprocessing and Metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.manifold import TSNE
from scipy.stats import gaussian_kde

In [None]:
import os
import requests
import pickle

In [None]:
# Visualization Settings: seaborn "whitegrid" styling plus a 10x6 default figure size.
# The figure size is applied after sns.set() so it is the value left in rcParams.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Reproducibility: seed every random number generator used in this notebook
SEED = 42


def set_seed(seed=SEED):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed (int): Value handed to every generator. Defaults to ``SEED``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only seeded when a GPU is reported as available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed()

In [None]:
# Device Setup: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Optional: Display CUDA device information.
# Runs only when the selected `device` is a CUDA device; prints nothing otherwise.
if device.type == "cuda":
    # Device index 0 is queried explicitly, so the name shown is that of GPU 0.
    print(f"CUDA Device Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
    print(f"CUDA Available: {torch.cuda.is_available()}")

In [None]:
# Utility function to check memory usage (if needed during training)
def print_memory_usage():
    """Print the CUDA memory currently allocated and reserved.

    Reads the module-level ``device``; when it is not a CUDA device the
    function returns without printing anything. Values are converted from
    bytes by dividing by 1024**2 before being printed with an "MB" label.
    """
    if device.type != "cuda":
        return

    bytes_per_mb = 1024**2
    allocated = torch.cuda.memory_allocated() / bytes_per_mb
    reserved = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"Memory Allocated: {allocated:.2f} MB")
    print(f"Memory Reserved: {reserved:.2f} MB")

In [None]:
# Progress marker: this cell performs no checks of its own; it only prints a
# confirmation message when it is executed.
print("Basic imports and device setup complete.")

Dataset Class, Loading, and Preprocessing

In [None]:
# Define Dataset Class
class MTSDataset(Dataset):
    """
    Custom PyTorch Dataset for Multivariate Time Series (MTS) forecasting.

    Each sample is a pair of consecutive windows cut from the series: an input
    window of ``seq_length`` rows immediately followed by a target window of
    ``forecast_length`` rows. Sample ``i`` starts at row ``i``, so neighbouring
    samples are shifted by one row.

    Args:
        data (numpy.ndarray): Raw time series data (shape: [samples, features]).
            Other array-likes are converted with ``numpy.asarray``.
        seq_length (int): Length of the input sequence for forecasting.
        forecast_length (int): Length of the prediction horizon.
        normalize (bool): Whether to apply normalization to the data.

    Attributes:
        data (numpy.ndarray): The (optionally standardized) series.
        scaler (StandardScaler or None): The fitted scaler when ``normalize``
            is True, otherwise None.

    Raises:
        ValueError: If ``data`` is None or contains no elements.
    """
    def __init__(self, data, seq_length, forecast_length, normalize=True):
        if data is None:
            raise ValueError("Data is empty or not properly loaded.")
        data = np.asarray(data)
        if data.size == 0:
            raise ValueError("Data is empty or not properly loaded.")

        self.data = data
        self.seq_length = seq_length
        self.forecast_length = forecast_length
        self.normalize = normalize
        # Always defined so callers can test `dataset.scaler is None`.
        self.scaler = None

        # Normalize data using StandardScaler if enabled.
        # NOTE(review): the scaler is fitted on the whole array passed in; if the
        # caller later splits this dataset into train/validation/test parts, the
        # statistics will include the held-out rows — confirm this is intended.
        if self.normalize:
            self.scaler = StandardScaler()
            self.data = self.scaler.fit_transform(self.data)

    def __len__(self):
        """Return the number of complete (input, target) window pairs."""
        # The last valid start index is len - seq_length - forecast_length, so the
        # count is that value plus one. Clamp at zero because len() must not
        # receive a negative number when the series is shorter than one window.
        n_windows = len(self.data) - self.seq_length - self.forecast_length + 1
        return max(n_windows, 0)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of float32 tensors ``(x, y)``.

        Args:
            idx (int): Sample index; negative values count from the end.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: ``x`` holds ``seq_length`` rows and
            ``y`` the following ``forecast_length`` rows.

        Raises:
            IndexError: If ``idx`` is outside the valid range. This also lets
                plain ``for x, y in dataset`` iteration terminate.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(
                f"Index out of range for MTSDataset with {n_samples} samples."
            )

        # Extract input (X) and target (Y) sequences
        split = idx + self.seq_length
        x = self.data[idx:split]
        y = self.data[split:split + self.forecast_length]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

In [None]:
# Automatic Dataset Download Function
def download_datasets(dataset_names, save_path="datasets", timeout=30):
    """
    Automatically download multiple datasets for MTS forecasting.

    Files that already exist in ``save_path`` are reused without contacting the
    network. A download is streamed to a temporary ``.part`` file and moved to
    its final name only after it completes, so an interrupted transfer cannot
    be mistaken for a cached copy on a later call.

    Args:
        dataset_names (list): List of dataset names to download (e.g., ['ETT', 'METR-LA']).
        save_path (str): Directory to save the downloaded datasets.
        timeout (float or tuple): Timeout in seconds passed to ``requests.get``
            so that an unresponsive server cannot block the call indefinitely.

    Returns:
        dict: Paths to the downloaded datasets, keyed by dataset name. Unknown
        names and failed downloads are reported on stdout and left out.
    """
    os.makedirs(save_path, exist_ok=True)
    # NOTE(review): the METR-LA and PEMS-BAY URLs point at files under
    # `sensor_graph/` named `adj_mx*.pkl` — presumably sensor adjacency data
    # rather than the traffic time series themselves; confirm this is intended.
    dataset_urls = {
        "ETT": "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTm2.csv",
        "METR-LA": "https://raw.githubusercontent.com/liyaguang/DCRNN/master/data/sensor_graph/adj_mx.pkl",
        "PEMS-BAY": "https://raw.githubusercontent.com/liyaguang/DCRNN/master/data/sensor_graph/adj_mx_bay.pkl"
    }

    downloaded_paths = {}
    for dataset_name in dataset_names:
        url = dataset_urls.get(dataset_name)
        if url is None:
            print(f"Dataset {dataset_name} is not available for automatic download.")
            continue

        file_name = os.path.join(save_path, os.path.basename(url))

        # Download the dataset if it doesn't already exist
        if not os.path.exists(file_name):
            print(f"Downloading {dataset_name} dataset from {url}...")
            partial_name = file_name + ".part"
            try:
                # The context manager releases the connection even on failure.
                with requests.get(url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()  # Raise an HTTPError for bad responses
                    with open(partial_name, "wb") as f:
                        for chunk in response.iter_content(chunk_size=64 * 1024):
                            if chunk:
                                f.write(chunk)
                # Publish the file under its final name only once it is complete.
                os.replace(partial_name, file_name)
                print(f"{dataset_name} dataset downloaded successfully.")
            except (requests.exceptions.RequestException, OSError) as e:
                print(f"Failed to download {dataset_name} dataset. Error: {e}")
                # Drop any truncated leftover so the next call retries cleanly.
                if os.path.exists(partial_name):
                    os.remove(partial_name)
                continue

        downloaded_paths[dataset_name] = file_name

    return downloaded_paths

In [None]:
# Load Dataset Function with Format-Specific Handling
def load_data(dataset_name, file_path, delimiter=','):
    """
    Load and preprocess datasets, adapting to specific formats (e.g., .csv, .pkl).

    The loader is chosen by ``dataset_name`` (not by file extension):
    "METR-LA" and "PEMS-BAY" are unpickled, "ETT" is read as CSV. Any failure
    is reported on stdout and signalled by returning None instead of raising.

    Args:
        dataset_name (str): Name of the dataset being loaded.
        file_path (str): Path to the dataset file.
        delimiter (str): Delimiter used in the CSV file.

    Returns:
        numpy.ndarray, object or None: For "ETT", the CSV contents as a NumPy
        array, without the first column when that column holds strings. For
        "METR-LA" / "PEMS-BAY", whatever object the pickle file contains.
        None when the dataset name is unsupported or loading fails.
    """
    print(f"Loading {dataset_name} dataset from {file_path}...")

    try:
        # Handle binary `.pkl` files (e.g., METR-LA, PEMS-BAY)
        if dataset_name in ("METR-LA", "PEMS-BAY"):
            # SECURITY: unpickling can execute arbitrary code. Only load files
            # obtained from a source you trust.
            with open(file_path, "rb") as f:
                # encoding="latin1" lets Python 3 read 8-bit strings written by
                # Python 2 pickles — presumably how these files were produced.
                data = pickle.load(f, encoding="latin1")
            # NOTE(review): the message below says "dictionary", but the type of
            # the unpickled object is not checked here — confirm against the file.
            print(f"{dataset_name} dataset loaded successfully as a dictionary.")
            return data

        # Handle CSV files (e.g., ETT)
        if dataset_name == "ETT":
            data = pd.read_csv(file_path, sep=delimiter, on_bad_lines="skip")
            # Check emptiness first: data.iloc[0, 0] on an empty frame would
            # raise an IndexError and hide the real cause.
            if data.empty:
                raise ValueError(f"The dataset at {file_path} is empty or not properly formatted.")
            if isinstance(data.iloc[0, 0], str):
                print("Detected timestamp column. Excluding it from the dataset.")
                data = data.iloc[:, 1:]  # Exclude timestamp column
                # Dropping the only column leaves nothing usable.
                if data.empty:
                    raise ValueError(f"The dataset at {file_path} is empty or not properly formatted.")
            return data.values

        raise ValueError(f"Loading logic for {dataset_name} is not yet implemented.")

    except Exception as e:
        # Deliberate best-effort boundary: callers check for None.
        print(f"Error: An issue occurred while loading the {dataset_name} dataset.")
        print(f"Details: {e}")
        return None

In [None]:
# Example Usage: Download and Load Multiple Datasets
# Every listed dataset is downloaded and loaded, but only ETT is wrapped in an
# MTSDataset and stored in `loaded_data`. Any exception stops the cell and is
# reported with a single message.
try:
    dataset_names = ["ETT", "METR-LA", "PEMS-BAY"]
    downloaded_paths = download_datasets(dataset_names)
    loaded_data = {}

    for dataset_name, file_path in downloaded_paths.items():
        raw_data = load_data(dataset_name, file_path)

        # The remaining datasets are loaded above but not processed further here.
        if dataset_name != "ETT":
            continue

        seq_length = 12  # Length of historical input sequence
        forecast_length = 12  # Length of prediction horizon
        dataset = MTSDataset(raw_data, seq_length, forecast_length)
        loaded_data[dataset_name] = dataset
        print(f"ETT Dataset initialized with {len(dataset)} samples.")
except Exception as e:
    print(f"Error occurred: {e}")