In [None]:
import os
import pickle
import logging
import yaml
import tempfile
from einops import rearrange

import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
from pathlib import Path

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error


def get_root_dir():
    """Return the project root directory as a ``Path`` (``/content``)."""
    root = Path("/content")
    return root


def load_yaml_param_settings(yaml_fname: str):
    """Parse the YAML file at ``yaml_fname`` and return its content.

    NOTE(review): ``yaml.FullLoader`` is more permissive than
    ``yaml.SafeLoader``; only point this at trusted configuration files.
    """
    with open(yaml_fname, "r") as yaml_file:
        return yaml.load(yaml_file, Loader=yaml.FullLoader)


def preprocess(df, scaler: MinMaxScaler, kind: str):
    """Cast data to float32, replace NaNs, and scale it with ``scaler``.

    Args:
        df: Array-like data that converts to a 2-D float array.
        scaler: Scaler object exposing ``fit_transform`` and ``transform``.
        kind: ``"train"`` fits the scaler on the data and transforms it;
            ``"test"`` only transforms with the already-fitted scaler.

    Returns:
        The scaled data as returned by the scaler.

    Raises:
        ValueError: If ``kind`` is neither ``"train"`` nor ``"test"``, or if
            the converted data is not 2-D.
    """
    # Reject unknown modes up front instead of silently returning unscaled
    # data while still reporting "Data normalized".
    if kind not in ("train", "test"):
        raise ValueError(f"`kind` must be 'train' or 'test', but got {kind!r}")

    df = np.asarray(df, dtype=np.float32)

    if df.ndim != 2:
        raise ValueError("Data must be a 2-D array")

    if np.isnan(df).any():
        print("Data contains null values. Will be replaced with 0")
        # np.nan_to_num also maps +/-inf to large finite values.
        df = np.nan_to_num(df)

    if kind == "train":
        df = scaler.fit_transform(df)
    else:
        df = scaler.transform(df)
    print("Data normalized")

    return df


def minibatch_slices_iterator(length, step_size, ignore_incomplete_batch=False):
    """Yield consecutive ``slice`` objects that partition ``range(length)``.

    Every full slice spans ``step_size`` items. A shorter trailing slice is
    yielded for the remainder unless ``ignore_incomplete_batch`` is true.
    """
    n_full = length // step_size
    boundary = n_full * step_size

    offset = 0
    while offset < boundary:
        upcoming = offset + step_size
        yield slice(offset, upcoming, 1)
        offset = upcoming

    if offset < length and not ignore_incomplete_batch:
        yield slice(offset, length, 1)


class BatchSlidingWindow:
    """Mini-batch iterator factory over sliding windows of equally long arrays.

    A window is identified by the index of its last element; the window
    covers the ``window_size`` positions ending at that index. Windows that
    would start before index 0, or that contain an excluded position, are
    dropped.

    Args:
        array_size: Length of the arrays that will be iterated.
        window_size: Number of elements per window (at least 1).
        step_size: Number of windows yielded per mini-batch by
            ``get_iterator``.
        batch_size: Stored on the instance but not used by this class.
            NOTE(review): batching is driven by ``step_size``; confirm
            whether ``batch_size`` was meant to be used instead.
        excludes: Optional boolean array of shape ``(array_size,)``; windows
            containing a ``True`` position are skipped.
        shuffle: Whether to shuffle the window order on every iteration.
        ignore_incomplete_batch: Whether to drop a final, shorter batch.

    Raises:
        ValueError: On an invalid ``window_size``, an ``array_size`` smaller
            than ``window_size``, or a wrongly shaped ``excludes``.
    """

    def __init__(
        self, array_size, window_size, step_size, batch_size, excludes=None,
        shuffle=False, ignore_incomplete_batch=False
    ):
        if window_size < 1:
            raise ValueError("`window_size` must be at least 1")
        if array_size < window_size:
            raise ValueError("`array_size` must be at least as large as `window_size`")

        if excludes is not None:
            excludes = np.asarray(excludes, dtype=np.bool_)
            expected_shape = (array_size,)
            if excludes.shape != expected_shape:
                raise ValueError(
                    f"The shape of `excludes` is expected to be {expected_shape}, but got {excludes.shape}"
                )
            mask = np.logical_not(excludes)
        else:
            mask = np.ones([array_size], dtype=np.bool_)

        # The first `window_size - 1` positions cannot end a full window.
        mask[: window_size - 1] = False

        # An excluded position invalidates every window that contains it,
        # i.e. the windows ending at the next `window_size - 1` positions.
        # Only computed when `excludes` was given: calling `np.where` on
        # `None` is not a valid way to obtain "no excluded positions".
        if excludes is not None:
            where_excludes = np.where(excludes)[0]
            for k in range(1, window_size):
                also_excludes = where_excludes + k
                also_excludes = also_excludes[also_excludes < array_size]
                mask[also_excludes] = False

        indices = np.arange(array_size)[mask]
        # Column vector of window end indices; broadcasting against
        # `_offsets` expands each one into the full window.
        self._indices = indices.reshape([-1, 1])
        self._offsets = np.arange(-window_size + 1, 1)

        self._array_size = array_size
        self._window_size = window_size
        self._step_size = step_size
        self._batch_size = batch_size
        self._shuffle = shuffle
        self._ignore_incomplete_batch = ignore_incomplete_batch

    def get_iterator(self, arrays):
        """Iterate through the sliding windows of each array in `arrays`.

        Args:
            arrays: Non-empty iterable of array-likes, indexed along their
                first axis with the window indices.

        Yields:
            A tuple with one batch of windows per input array.

        Raises:
            ValueError: If ``arrays`` is empty.
        """
        arrays = tuple(np.asarray(a) for a in arrays)
        if not arrays:
            raise ValueError("`arrays` must not be empty")

        if self._shuffle:
            # In-place shuffle: the new order persists on the instance.
            np.random.shuffle(self._indices)

        for s in minibatch_slices_iterator(
            length=len(self._indices),
            step_size=self._step_size,
            ignore_incomplete_batch=self._ignore_incomplete_batch,
        ):
            idx = self._indices[s] + self._offsets
            yield tuple(a[idx] if len(a.shape) == 1 else a[idx, :] for a in arrays)


def freeze(model):
    """Disable gradient tracking on every parameter of ``model``."""
    for weight in model.parameters():
        weight.requires_grad = False


def unfreeze(model):
    """Enable gradient tracking on every parameter of ``model``."""
    for weight in model.parameters():
        weight.requires_grad = True


def save_model(models_dict: dict, dirname="saved_models", id: str = ""):
    """Save the ``state_dict`` of every model in ``models_dict`` as a checkpoint.

    Checkpoints are written to ``<root>/<dirname>/<model_name>[-<id>].ckpt``,
    where ``<root>`` comes from ``get_root_dir()``. If a ``PermissionError``
    occurs, the checkpoints are written to the system temporary directory
    instead and its location is printed.

    Args:
        models_dict: Mapping of model name to an object with ``state_dict()``.
        dirname: Directory (relative to the project root) to save into.
        id: Optional suffix appended to every checkpoint file name.
    """
    suffix = f"-{id}" if id else ""

    def _write_checkpoints(target_dir):
        # One checkpoint file per model, named after its dictionary key.
        for model_name, model in models_dict.items():
            torch.save(model.state_dict(), target_dir.joinpath(f"{model_name}{suffix}.ckpt"))

    try:
        save_path = get_root_dir().joinpath(dirname)
        # Creates missing parents too and tolerates an existing directory,
        # avoiding the check-then-create race of isdir() + mkdir().
        save_path.mkdir(parents=True, exist_ok=True)
        _write_checkpoints(save_path)
    except PermissionError:
        dirname = tempfile.gettempdir()
        print(
            f"\nThe trained model is saved in the following temporary dirname due to some permission error: {dirname}.\n"
        )
        _write_checkpoints(Path(dirname))


def time_to_timefreq(x, n_fft: int, C: int):
    """Convert a time-domain signal to a real-valued STFT representation.

    Args:
        x: Tensor laid out as ``(b, c, l)`` per the rearrange pattern below.
        n_fft: FFT size passed to ``torch.stft``.
        C: Number of channels ``c`` used to unfold the merged batch axis.

    Returns:
        Tensor laid out as ``(b, c * z, n, t)``, where ``z = 2`` holds the
        real and imaginary parts and ``(n, t)`` are the STFT output axes.
    """
    x = rearrange(x, "b c l -> (b c) l")
    # Request a complex result and split it into (real, imag) explicitly;
    # `return_complex=False` is deprecated in torch.stft.
    x = torch.stft(x, n_fft, normalized=False, return_complex=True)
    x = torch.view_as_real(x)
    x = rearrange(x, "(b c) n t z -> b (c z) n t ", c=C)
    return x


def timefreq_to_time(x, n_fft: int, C: int):
    """Convert a real-valued STFT representation back to the time domain.

    Args:
        x: Tensor laid out as ``(b, c * z, n, t)`` with ``z = 2`` holding
            the real and imaginary parts.
        n_fft: FFT size passed to ``torch.istft``.
        C: Number of channels ``c``.

    Returns:
        Tensor laid out as ``(b, c, l)``.
    """
    x = rearrange(x, "b (c z) n t -> (b c) n t z", c=C)
    # torch.istft expects a complex tensor; view_as_complex needs a
    # contiguous tensor whose last dimension has size 2.
    x = torch.view_as_complex(x.contiguous())
    x = torch.istft(x, n_fft, normalized=False, return_complex=False)
    x = rearrange(x, "(b c) l -> b c l", c=C)
    return x


def compute_var_loss(z):
    """Return the mean hinge penalty ``relu(1 - std)`` over the columns of ``z``.

    The standard deviation is taken along dim 0, with ``1e-4`` added to the
    variance before the square root.
    """
    per_feature_std = torch.sqrt(z.var(dim=0) + 1e-4)
    hinge = torch.relu(1.0 - per_feature_std)
    return hinge.mean()


def compute_cov_loss(z):
    """Return the mean squared off-diagonal entry of the feature correlation.

    Args:
        z: 2-D tensor laid out as (batch, feature).

    Returns:
        Scalar tensor: the mean over all (feature x feature) entries of the
        squared matrix, with the diagonal zeroed out first.
    """
    norm_z = z - z.mean(dim=0)
    norm_z = F.normalize(norm_z, p=2, dim=0)  # (batch * feature); l2-norm
    fxf_cov_z = torch.mm(norm_z.T, norm_z)  # (feature * feature)
    # Zero the diagonal in place; this keeps the matrix's own dtype and
    # device instead of assigning a separately built float32 tensor.
    fxf_cov_z.fill_diagonal_(0)
    cov_loss = (fxf_cov_z**2).mean()
    return cov_loss


def quantize(z, vq_model, transpose_channel_length_axes=False):
    """Run ``vq_model`` on ``z`` and return its four outputs.

    A 4-D input ``(b, c, h, w)`` is flattened to ``(b, h*w, c)`` before the
    call and the quantized output is folded back to ``(b, c, h, w)``. A 3-D
    input is passed through as is, or transposed between ``(b, c, l)`` and
    ``(b, l, c)`` around the call when ``transpose_channel_length_axes`` is
    set. Any other rank raises ``ValueError``.

    Returns:
        Tuple ``(z_q, indices, vq_loss, perplexity)``.
    """
    n_spatial_axes = len(z.shape) - 2
    if n_spatial_axes not in (1, 2):
        raise ValueError

    if n_spatial_axes == 1:
        model_input = z
        if transpose_channel_length_axes:
            model_input = rearrange(z, "b c l -> b (l) c")
        z_q, indices, vq_loss, perplexity = vq_model(model_input)
        if transpose_channel_length_axes:
            z_q = rearrange(z_q, "b (l) c -> b c l")
        return z_q, indices, vq_loss, perplexity

    height, width = z.shape[2:]
    flat = rearrange(z, "b c h w -> b (h w) c")
    z_q, indices, vq_loss, perplexity = vq_model(flat)
    z_q = rearrange(z_q, "b (h w) c -> b c h w", h=height, w=width)
    return z_q, indices, vq_loss, perplexity


def zero_pad_high_freq(xf):
    """Return a copy of ``xf`` that keeps only index 0 along axis 2.

    All other positions along axis 2 are zero. The result has the same
    shape, dtype and device as ``xf``; ``xf`` itself is not modified.
    """
    # zeros_like allocates directly on xf's device with xf's dtype.
    xf_l = torch.zeros_like(xf)
    xf_l[:, :, 0, :] = xf[:, :, 0, :]
    return xf_l


def zero_pad_low_freq(xf):
    """Return a copy of ``xf`` with index 0 along axis 2 set to zero.

    Every other position along axis 2 is copied from ``xf``. The result has
    the same shape, dtype and device as ``xf``; ``xf`` itself is not modified.
    """
    # zeros_like allocates directly on xf's device with xf's dtype.
    xf_h = torch.zeros_like(xf)
    xf_h[:, :, 1:, :] = xf[:, :, 1:, :]
    return xf_h


def compute_emb_loss(codebook, x, use_cosine_sim, esm_max_codes):
    """Match the mean and covariance of sampled codebook vectors to those of ``x``.

    ``x`` is flattened to ``(-1, x.shape[-1])``. At most ``esm_max_codes``
    rows of ``codebook.embed`` are drawn with ``torch.randint`` (so rows may
    repeat). With ``use_cosine_sim`` both sides are L2-normalized along the
    last axis first. The statistics of ``x`` are detached, so gradients only
    flow into the codebook side.

    Returns:
        Sum of the MSE between the covariances and the MSE between the means.
    """
    codes = codebook.embed
    samples = x.reshape(-1, x.shape[-1])

    if use_cosine_sim:
        samples = F.normalize(samples, p=2, dim=-1)
        codes = F.normalize(codes, p=2, dim=-1)

    n_codes = codes.shape[0]
    picked = torch.randint(0, n_codes, size=(min(esm_max_codes, n_codes),))
    codes = codes[picked]

    cov_term = F.mse_loss(torch.cov(samples.t()).detach(), torch.cov(codes.t()))
    mean_term = F.mse_loss(torch.mean(samples, dim=0).detach(), torch.mean(codes, dim=0))
    return cov_term + mean_term


def compute_downsample_rate(input_length: int, n_fft: int, downsampled_width: int):
    """Return the rounded downsample rate, or 1 for inputs shorter than the target.

    The rate is ``input_length / (log2(n_fft) - 1) / downsampled_width``
    rounded with ``round``; it is only evaluated when ``input_length`` is at
    least ``downsampled_width``.
    """
    if input_length >= downsampled_width:
        rate = round(input_length / (np.log2(n_fft) - 1) / downsampled_width)
    else:
        rate = 1
    return rate


def compute_performance(ground_truth, predictions):
    """Compute reconstruction metrics between ground truth and predictions.

    Args:
        ground_truth: Array-like reference values.
        predictions: Array-like predicted values with the same shape. A 1-D
            ``predictions`` is reshaped to a column when ``ground_truth`` is
            2-D with a single column.

    Returns:
        Dict with keys ``"composition_error"`` (mean absolute difference),
        ``"reconstruction_rmse"``, ``"reconstruction_mae"`` and ``"mape"``
        (in percent).

    Raises:
        ValueError: If the shapes differ and cannot be reconciled.
    """
    ground_truth = np.asarray(ground_truth)
    predictions = np.asarray(predictions)

    if ground_truth.shape != predictions.shape:
        # Attempt to reshape predictions if they are 1D and ground_truth is 2D with one column
        if len(predictions.shape) == 1 and len(ground_truth.shape) == 2 and ground_truth.shape[1] == 1:
            predictions = predictions.reshape(-1, 1)
        else:
            raise ValueError("Ground truth and predictions must have the same shape.")

    abs_error = np.abs(ground_truth - predictions)

    # NOTE(review): this is the plain mean absolute difference, i.e. the
    # same formula as mean absolute error; confirm the intended definition.
    composition_error = np.mean(abs_error)
    reconstruction_rmse = np.sqrt(mean_squared_error(ground_truth, predictions))
    reconstruction_mae = mean_absolute_error(ground_truth, predictions)

    # Mean Absolute Percentage Error (MAPE).
    # Clamp the magnitude of the denominator from below instead of adding
    # an epsilon to the signed value, which could cancel to zero for small
    # negative ground-truth values.
    denominator = np.maximum(np.abs(ground_truth), 1e-8)
    mape = np.mean(abs_error / denominator) * 100

    return {
        "composition_error": composition_error,
        "reconstruction_rmse": reconstruction_rmse,
        "reconstruction_mae": reconstruction_mae,
        "mape": mape,
    }


def compute_compression_ratio(original_size_bytes, compressed_size_bytes):
    """Return ``original_size_bytes / compressed_size_bytes``.

    When the compressed size is zero or negative, a warning is printed and
    ``None`` is returned instead.
    """
    size_is_usable = not (compressed_size_bytes <= 0)
    if size_is_usable:
        return original_size_bytes / compressed_size_bytes

    print("Warning: Compressed size is zero or negative. Cannot compute compression ratio.")
    return None


class TimeVQVAE:
    """Stand-in for the TimeVQVAE model that produces random outputs.

    No encoding or decoding happens here: tokens and reconstructions are
    drawn from NumPy's global random generator.
    """

    def __init__(self, num_embeddings=512, embedding_dim=64):
        self.num_embeddings, self.embedding_dim = num_embeddings, embedding_dim
        # Random codebook of shape (num_embeddings, embedding_dim); demo only.
        self.codebook = torch.randn(num_embeddings, embedding_dim)

    def tokenize(self, data):
        """Return random token ids of shape ``(data.shape[0], seq_len // 4)``.

        ``seq_len`` is ``data.shape[1]`` for multi-dimensional input and
        ``data.shape[0]`` otherwise. A real model would encode and
        vector-quantize ``data`` here.
        """
        shape = data.shape
        seq_len = shape[0] if len(shape) <= 1 else shape[1]
        token_shape = (shape[0], seq_len // 4)
        return np.random.randint(0, self.num_embeddings, size=token_shape)

    def reconstruct(self, tokens):
        """Return uniform random data of shape ``(batch, 4 * token_len)``.

        The factor 4 mirrors the reduction applied in ``tokenize``. A real
        model would decode ``tokens`` here.
        """
        n_rows, n_tokens = tokens.shape
        return np.random.rand(n_rows, n_tokens * 4)


def your_model_inference_with_compression(data, vqvae_model):
    """Tokenize and reconstruct the numeric columns of ``data``.

    Args:
        data: pandas DataFrame; only its numeric columns are used.
        vqvae_model: Object exposing ``tokenize`` and ``reconstruct``.

    Returns:
        Tuple ``(predictions, original_size_bytes, compressed_size_bytes)``,
        where the sizes are the ``nbytes`` of the numeric array and of the
        token array respectively.
    """
    numeric_values = data.select_dtypes(include=np.number).values

    # "Compression": the model maps the values to token ids ...
    token_ids = vqvae_model.tokenize(numeric_values)
    # ... and "decompression" maps the token ids back to values.
    reconstructed = vqvae_model.reconstruct(token_ids)

    # The compressed size is whatever the token array occupies in memory,
    # so it depends on the token dtype chosen by the model.
    return reconstructed, numeric_values.nbytes, token_ids.nbytes




In [None]:
import pandas as pd
import numpy as np
# from time_vqvae_utils import compute_performance, compute_compression_ratio, TimeVQVAE, your_model_inference_with_compression

# Dataset names mapped to the CSV file each one is loaded from.
dataset_info = {
    'ETTh1': '/content/ETTh1.csv',
    'ETTh2': '/content/ETTh2.csv',
    'ETTm1': '/content/ETTm1.csv',
    'ETTm2': '/content/ETTm2.csv',
    'Electricity': '/content/electricity.csv',
    'Traffic': '/content/traffic.csv',
    'Weather': '/content/weather.csv',
    'National Illness': '/content/national_illness.csv',
}

# Initialize TimeVQVAE model (placeholder)
vqvae_model = TimeVQVAE(num_embeddings=512, embedding_dim=64)

# Per-dataset results, keyed by dataset name. A dataset that fails to load
# or to evaluate simply has no entry and shows up empty in the final table.
performance_results = {}
compression_results = {}

print("\n--- Starting TimeVQVAE Experiment ---")

for dataset_name, dataset_path in dataset_info.items():
    print(f"\n--- Processing Dataset: {dataset_name} ---")
    try:
        df = pd.read_csv(dataset_path)
        print(f"{dataset_name} loaded successfully!")
    except FileNotFoundError:
        print(f"Error: {dataset_name} file not found at {dataset_path}. Skipping.")
        continue
    except Exception as e:
        print(f"An error occurred while loading {dataset_name}: {e}. Skipping.")
        continue

    try:
        # Get predictions and size information from the model
        predictions, original_size, compressed_size = your_model_inference_with_compression(df, vqvae_model)

        # Ground truth is restricted to the numeric columns, matching what
        # the inference helper feeds to the model.
        input_data = df.select_dtypes(include=np.number).values
        try:
            ground_truth = input_data.astype(np.float32)
            predictions = predictions.astype(np.float32)

            # Crop both arrays to their common extent so the metrics can be
            # computed even if the model outputs a different shape.
            min_rows = min(ground_truth.shape[0], predictions.shape[0])
            min_cols = min(ground_truth.shape[1], predictions.shape[1])
            ground_truth_subset = ground_truth[:min_rows, :min_cols]
            predictions_subset = predictions[:min_rows, :min_cols]

            # Compute performance metrics
            metrics = compute_performance(ground_truth_subset, predictions_subset)
            performance_results[dataset_name] = metrics

            print("Performance Metrics:")
            print(f"  Composition Error: {metrics['composition_error']:.4f}")
            print(f"  Reconstruction RMSE: {metrics['reconstruction_rmse']:.4f}")
            print(f"  Reconstruction MAE: {metrics['reconstruction_mae']:.4f}")
            print(f"  MAPE: {metrics['mape']:.4f}")

        except ValueError as e:
            print(f"  Could not compute performance for {dataset_name}: {e}")
            print("  Please ensure the output of your model has a compatible shape and data type for comparison with the ground truth.")
        except Exception as e:
            print(f"  An unexpected error occurred while computing performance for {dataset_name}: {e}")

        # Compute compression ratio (independent of the metrics above)
        compression_ratio = compute_compression_ratio(original_size, compressed_size)
        compression_results[dataset_name] = compression_ratio

        if compression_ratio is not None:
            print(f"  Compression Ratio: {compression_ratio:.2f}")

    except Exception as e:
        print(f"  An error occurred during model inference or compression for {dataset_name}: {e}")

# Print final summary in a structured table
print("\n--- Final Comparative Analysis of Performance Across Datasets ---")

summary_columns = [
    'Dataset',
    'Composition Error',
    'Reconstruction RMSE',
    'Reconstruction MAE',
    'MAPE',
    'Compression Ratio',
]

# Collect all rows first and build the DataFrame once; concatenating onto
# an empty frame inside the loop is slow and triggers a pandas warning.
summary_rows = []
for dataset_name in dataset_info:
    metrics = performance_results.get(dataset_name, {})
    summary_rows.append({
        'Dataset': dataset_name,
        'Composition Error': metrics.get('composition_error'),
        'Reconstruction RMSE': metrics.get('reconstruction_rmse'),
        'Reconstruction MAE': metrics.get('reconstruction_mae'),
        'MAPE': metrics.get('mape'),
        'Compression Ratio': compression_results.get(dataset_name),
    })

results_df = pd.DataFrame(summary_rows, columns=summary_columns)

print(results_df.to_string(index=False))


--- Starting TimeVQVAE Experiment ---

--- Processing Dataset: ETTh1 ---
ETTh1 loaded successfully!
Performance Metrics:
  Composition Error: 4.8999
  Reconstruction RMSE: 6.5045
  Reconstruction MAE: 4.8999
  MAPE: 62514976.0000
  Compression Ratio: 7.00

--- Processing Dataset: ETTh2 ---
ETTh2 loaded successfully!
Performance Metrics:
  Composition Error: 24.1082
  Reconstruction RMSE: 30.3314
  Reconstruction MAE: 24.1082
  MAPE: 351125824.0000
  Compression Ratio: 7.00

--- Processing Dataset: ETTm1 ---
ETTm1 loaded successfully!
Performance Metrics:
  Composition Error: 4.9152
  Reconstruction RMSE: 6.5238
  Reconstruction MAE: 4.9152
  MAPE: 61533812.0000
  Compression Ratio: 7.00

--- Processing Dataset: ETTm2 ---
ETTm2 loaded successfully!
Performance Metrics:
  Composition Error: 24.1329
  Reconstruction RMSE: 30.3569
  Reconstruction MAE: 24.1329
  MAPE: 357303328.0000
  Compression Ratio: 7.00

--- Processing Dataset: Electricity ---
Electricity loaded successfully!
Perform

  results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
