<a href="https://colab.research.google.com/github/pmxfa/sp-shapely/blob/main/sp_timevae_weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install synthcity

# Training

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import sys
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader
from synthcity.utils.serialization import save_to_file, load_from_file

log.add(sink=sys.stderr, level="INFO")

Mounted at /content/drive


In [3]:
# Define file path
file_path = "/content/drive/Shareddrives/sp_env/datasets/Weather/weather.csv"

df = pd.read_csv(file_path)
print(df.head())
print(df.info())
print(df.isnull().sum())

                  date  p (mbar)  T (degC)  Tpot (K)  Tdew (degC)  rh (%)  \
0  2020-01-01 00:10:00   1008.89      0.71    273.18        -1.33    86.1   
1  2020-01-01 00:20:00   1008.76      0.75    273.22        -1.44    85.2   
2  2020-01-01 00:30:00   1008.66      0.73    273.21        -1.48    85.1   
3  2020-01-01 00:40:00   1008.64      0.37    272.86        -1.64    86.3   
4  2020-01-01 00:50:00   1008.61      0.33    272.82        -1.50    87.4   

   VPmax (mbar)  VPact (mbar)  VPdef (mbar)  sh (g/kg)  ...  wv (m/s)  \
0          6.43          5.54          0.89       3.42  ...      1.02   
1          6.45          5.49          0.95       3.39  ...      0.43   
2          6.44          5.48          0.96       3.39  ...      0.61   
3          6.27          5.41          0.86       3.35  ...      1.11   
4          6.26          5.47          0.79       3.38  ...      0.49   

   max. wv (m/s)  wd (deg)  rain (mm)  raining (s)  SWDR (W/m�)  \
0           1.60     224.3     

In [4]:
# Set seed for reproducibility
np.random.seed(42)

# Convert 'date' to datetime and set as index (preserves chronological order)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df.sort_index(inplace=True)

# Exclude 'date' from feature selection (it's now the index anyway)
columns = df.columns  # 'date' is no longer in df.columns
selected_features = np.random.choice(columns, size=10, replace=False)

print(f"Selected features: {selected_features}")

# Keep only the selected features
df = df[selected_features]

Selected features: ['p (mbar)' 'PAR (�mol/m�/s)' 'raining (s)' 'T (degC)' 'sh (g/kg)'
 'VPmax (mbar)' 'wv (m/s)' 'Tdew (degC)' 'max. PAR (�mol/m�/s)'
 'SWDR (W/m�)']


In [5]:
print(df.head())

                     p (mbar)  PAR (�mol/m�/s)  raining (s)  T (degC)  \
date                                                                    
2020-01-01 00:10:00   1008.89              0.0          0.0      0.71   
2020-01-01 00:20:00   1008.76              0.0          0.0      0.75   
2020-01-01 00:30:00   1008.66              0.0          0.0      0.73   
2020-01-01 00:40:00   1008.64              0.0          0.0      0.37   
2020-01-01 00:50:00   1008.61              0.0          0.0      0.33   

                     sh (g/kg)  VPmax (mbar)  wv (m/s)  Tdew (degC)  \
date                                                                  
2020-01-01 00:10:00       3.42          6.43      1.02        -1.33   
2020-01-01 00:20:00       3.39          6.45      0.43        -1.44   
2020-01-01 00:30:00       3.39          6.44      0.61        -1.48   
2020-01-01 00:40:00       3.35          6.27      1.11        -1.64   
2020-01-01 00:50:00       3.38          6.26      0.49        

In [8]:
# Keep the latest 5000 rows
df_latest = df.tail(5000)

# Train-test split: 70% for training, 30% for testing (TSTR)
train_size = int(0.7 * len(df_latest))
df_train = df_latest.iloc[:train_size]
df_test = df_latest.iloc[train_size:]  # use later for LSTM-TSTR

# Normalize the data
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df_train)
df_scaled_train = pd.DataFrame(scaled_train, columns=df_train.columns, index=df_train.index)
scaled_test = scaler.transform(df_test)
df_scaled_test = pd.DataFrame(scaled_test, columns=df_test.columns, index=df_test.index)

# Sequence length for time-series data (dataset = hourly; 24 hours)
sequence_length = 36

In [None]:
temporal_data = []
observation_times = []

# Generate sequences from df_scaled_train only
for start in range(len(df_scaled_train) - sequence_length + 1):
    sequence = df_scaled_train.iloc[start:start + sequence_length].reset_index(drop=True)
    temporal_data.append(sequence)
    observation_times.append(list(range(sequence_length)))  # relative time within the window

dummy_outcome = pd.DataFrame(np.zeros(len(temporal_data)), columns=["outcome"])

loader = TimeSeriesDataLoader(
    temporal_data=temporal_data,
    observation_times=observation_times,
    static_data=None,
    outcome=dummy_outcome,
)

# Print the loader info
print(f"TimeSeriesDataLoader created with {len(temporal_data)} sequences")

In [9]:
temporal_data_test = []
observation_times_test = []

# Generate sequences from df_scaled_test only
for start in range(len(df_scaled_test) - sequence_length + 1):
    sequence = df_scaled_test.iloc[start:start + sequence_length].reset_index(drop=True)
    temporal_data_test.append(sequence)
    observation_times_test.append(list(range(sequence_length)))  # relative time within the window

# Dummy outcome for TimeGAN (can be used in DataLoader)
dummy_outcome = pd.DataFrame(np.zeros(len(temporal_data_test)), columns=["outcome"])

# Create DataLoader for TimeGAN
loader_test = TimeSeriesDataLoader(
    temporal_data=temporal_data_test,
    observation_times=observation_times_test,
    static_data=None,
    outcome=dummy_outcome,
)

# Print the loader info
print(f"TimeSeriesDataLoader TEST SET created with {len(temporal_data_test)} sequences")

TimeSeriesDataLoader TEST SET created with 1465 sequences


In [None]:
print(len(df_scaled_train))  # Check the length of the dataframe
print(loader.dataframe())

3500
        seq_id  seq_time_id  seq_temporal_PAR (�mol/m�/s)  \
0            0            0                      0.000000   
1            0            1                      0.000000   
2            0            2                      0.000000   
3            0            3                      0.000000   
4            0            4                      0.000000   
...        ...          ...                           ...   
124735    3464           31                      0.379520   
124736    3464           32                      0.396154   
124737    3464           33                      0.397787   
124738    3464           34                      0.451815   
124739    3464           35                      0.408922   

        seq_temporal_SWDR (W/m�)  seq_temporal_T (degC)  \
0                       0.000000               0.114042   
1                       0.000000               0.127230   
2                       0.000000               0.148177   
3                       0.

In [None]:
syn_model = Plugins().get("timevae")

[2025-05-01T08:18:03.503116+0000][1547][CRITICAL] module disabled: /usr/local/lib/python3.11/dist-packages/synthcity/plugins/generic/plugin_goggle.py
[2025-05-01T08:18:03.503116+0000][1547][CRITICAL] module disabled: /usr/local/lib/python3.11/dist-packages/synthcity/plugins/generic/plugin_goggle.py


In [None]:
# Print all parameters of initialized model
for attr in dir(syn_model):
    if not attr.startswith("_") and not callable(getattr(syn_model, attr)):
        print(f"{attr}: {getattr(syn_model, attr)}")

batch_size: 64
class_name: TimeVAEPlugin
clipping_value: 0
compress_dataset: False
decoder_batch_norm: False
decoder_dropout: 0.01
decoder_n_layers_hidden: 2
decoder_n_units_hidden: 150
decoder_nonlin: leaky_relu
decoder_nonlin_out_continuous: tanh
decoder_nonlin_out_discrete: softmax
decoder_residual: True
device: cuda
embedding_penalty: 10
encoder: None
encoder_batch_norm: False
encoder_dropout: 0.1
encoder_max_clusters: 20
encoder_n_layers_hidden: 3
encoder_n_units_hidden: 300
encoder_nonlin: leaky_relu
expecting_conditional: False
fitted: False
gamma_penalty: 1
lr: 0.001
mode: LSTM
module_name: synthcity.plugins.time_series.plugin_timevae
module_relative_path: ../time_series/plugin_timevae.py
moments_penalty: 100
n_iter: 1000
n_iter_print: 10
outcome_encoder: TabularEncoder(cat_encoder_params={'handle_unknown': 'ignore',
                                   'sparse_output': False},
               categorical_encoder='onehot',
               cont_encoder_params={'n_components': 20},
 

## fitting the model

In [None]:
print(loader.shape)
# Train the model
syn_model.fit(loader)

(124740, 13)


<synthcity.plugins.time_series.plugin_timevae.TimeVAEPlugin at 0x7f29e3a87f50>

In [None]:
save_to_file('/content/drive/Shareddrives/sp_env/saved_models/VAE_Weather.pkl', syn_model)

In [None]:
# --- Generate Synthetic Data ---
n_samples = len(temporal_data)
syn_data = syn_model.generate(count=n_samples)
print(syn_data.shape)

(124740, 13)


In [None]:
# --- Save with automated format ---
import datetime
import os
# Get the current date and time
now = datetime.datetime.now()
timestamp = now.strftime("%m%d%y-%H%M%S")  # MMDDYY-HHMMSS format

# Define the base directory
base_dir = "/content/drive/Shareddrives/sp_env/synthetic_datasets/TimeVAE/weather"  #CHANGE THIS
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Construct the filename
model_name = type(syn_model).__name__.lower() # Get model name dynamically
filename = f"{timestamp}-{model_name}-n_3000.csv"
filepath = os.path.join(base_dir, filename)

# Save the data
df_syn = syn_data.dataframe()
df_syn.to_csv(filepath, index=False)

print(f"Synthetic data saved to: {filepath}")

Synthetic data saved to: /content/drive/Shareddrives/sp_env/synthetic_datasets/TimeVAE/weather/050125-085757-timevaeplugin-n_3000.csv


# Evaluation

## Prerequisites

In [11]:
syn_data = pd.read_csv('/content/drive/Shareddrives/sp_env/synthetic_datasets/TimeVAE/weather/050125-085757-timevaeplugin-n_3000.csv')

In [13]:
selected_columns = ['seq_temporal_PAR (�mol/m�/s)', 'seq_temporal_SWDR (W/m�)', 'seq_temporal_T (degC)',	'seq_temporal_Tdew (degC)',	'seq_temporal_VPmax (mbar)',	'seq_temporal_max. PAR (�mol/m�/s)',	'seq_temporal_p (mbar)',	'seq_temporal_raining (s)',	'seq_temporal_sh (g/kg)',	'seq_temporal_wv (m/s)']
# Ensure real_data and synthetic_data only contain the selected columns
real_data = loader_test.dataframe()[selected_columns].to_numpy()
synthetic_data = syn_data[selected_columns].to_numpy()

In [14]:
print(real_data, "\n ------------------------------------------------------- \n", synthetic_data)
print(type(real_data),type(synthetic_data))
print(real_data.shape,synthetic_data.shape)

[[0.46769315 0.44037457 0.61830877 ... 0.         0.69210526 0.26002766]
 [0.48242026 0.46101414 0.64701319 ... 0.         0.70789474 0.23651452]
 [0.38726973 0.36198879 0.67416602 ... 0.         0.71315789 0.31258645]
 ...
 [0.         0.         0.28704422 ... 0.         0.28157895 0.03042877]
 [0.         0.         0.29325058 ... 0.         0.27894737 0.05670816]
 [0.         0.         0.28859581 ... 0.         0.28684211 0.05670816]] 
 ------------------------------------------------------- 
 [[0.55969194 0.16735992 0.63528333 ... 0.05576173 0.38321587 0.24738849]
 [0.0497281  0.18382783 0.19698515 ... 0.26311472 0.7505566  0.42824979]
 [0.180474   0.16735992 0.26041931 ... 0.51152689 0.49017772 0.38798876]
 ...
 [0.11980377 0.14982813 0.11016495 ... 0.03436267 0.03389352 0.04138451]
 [0.31936363 0.44767405 0.77762817 ... 0.05091336 0.5704801  0.5044388 ]
 [0.180474   0.44767405 0.4841849  ... 0.26311472 0.7505566  0.1886928 ]]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(527

## Generate distance metrics

### Helper Functions

In [15]:
from scipy.stats import wasserstein_distance, entropy
import numpy as np

def compute_wasserstein(real_data, synthetic_data, selected_columns):
    """
    Computes Wasserstein Distance between real and synthetic time-series data.

    """

    # Ensure both datasets have the same number of samples
    min_length = min(len(real_data), len(synthetic_data))
    real_trimmed = real_data[:min_length]  # Keep original order (no random sampling)
    synthetic_trimmed = synthetic_data[:min_length]  # Match size
    print(real_trimmed.shape,synthetic_trimmed.shape)

    wasserstein_results = {}

    # Compute Wasserstein Distance for each feature
    for i, col in enumerate(selected_columns):
        w_dist = wasserstein_distance(real_trimmed[:, i], synthetic_trimmed[:, i])
        wasserstein_results[col] = w_dist
        print(f"{w_dist}")

    return wasserstein_results

def compute_kl_divergence(real_data, synthetic_data, selected_columns, bins=50):
    """
    Computes KL Divergence between real and synthetic time-series data.

    """

    # Ensure both datasets have the same number of samples
    min_length = min(len(real_data), len(synthetic_data))
    real_trimmed = real_data[:min_length]  # Keep original order
    synthetic_trimmed = synthetic_data[:min_length]  # Match size

    kl_results = {}

    for i, col in enumerate(selected_columns):
        # Compute histogram-based probability distributions
        real_hist, _ = np.histogram(real_trimmed[:, i], bins=bins, density=True)
        synth_hist, _ = np.histogram(synthetic_trimmed[:, i], bins=bins, density=True)

        # Avoid zero probabilities (KL Divergence is undefined for zero values)
        real_hist += 1e-10
        synth_hist += 1e-10

        # Compute KL Divergence
        kl_div = entropy(real_hist, synth_hist)
        kl_results[col] = kl_div
        print(f"{kl_div}")

    return kl_results

### Generate Metrics

In [16]:
# Compute Wasserstein Distance
wasserstein_results = compute_wasserstein(real_data, synthetic_data, selected_columns)
print("Wasserstein Distance Results:")
print(wasserstein_results)

# Compute KL Divergence
kl_results = compute_kl_divergence(real_data, synthetic_data, selected_columns)
print("KL Divergence Results:")
print(kl_results)

(52740, 10) (52740, 10)
0.24981003213083094
0.28353058683631027
0.09160863536946366
0.11856572717572467
0.11013718754506546
0.24163104106897185
0.14526340791163445
0.1811604927996483
0.15202087792216107
0.041322533285129845
Wasserstein Distance Results:
{'seq_temporal_PAR (�mol/m�/s)': 0.24981003213083094, 'seq_temporal_SWDR (W/m�)': 0.28353058683631027, 'seq_temporal_T (degC)': 0.09160863536946366, 'seq_temporal_Tdew (degC)': 0.11856572717572467, 'seq_temporal_VPmax (mbar)': 0.11013718754506546, 'seq_temporal_max. PAR (�mol/m�/s)': 0.24163104106897185, 'seq_temporal_p (mbar)': 0.14526340791163445, 'seq_temporal_raining (s)': 0.1811604927996483, 'seq_temporal_sh (g/kg)': 0.15202087792216107, 'seq_temporal_wv (m/s)': 0.041322533285129845}
5.268580053218712
5.564311541865116
12.101990757334757
13.765908726211713
11.555151386810762
5.111432633239509
14.609472094964927
3.540381930675472
14.67597454699323
12.42613522855812
KL Divergence Results:
{'seq_temporal_PAR (�mol/m�/s)': 5.2685800532

# LSTM downstream

In [19]:
filepath = "/content/drive/Shareddrives/sp_env/synthetic_datasets/TimeVAE/weather/050125-085757-timevaeplugin-n_3000.csv"

In [23]:
real_data = loader_test.dataframe()
df_synth = pd.read_csv(filepath)

# Drop the unwanted column
real_data = real_data.drop(columns=["seq_id", "seq_time_id", "seq_out_outcome"], errors="ignore")
df_synth = df_synth.drop(columns=["seq_id", "seq_time_id", "seq_out_outcome"], errors="ignore")


In [24]:
print(f"real_data: {real_data.shape}, synthetic_data: {df_synth.shape}")

real_data: (52740, 10), synthetic_data: (124740, 10)


In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [26]:
# Convert to tensors (float32 for PyTorch)
data_real = torch.tensor(real_data.values, dtype=torch.float32)
data_synth = torch.tensor(df_synth.values, dtype=torch.float32)

# ──────── Sequence builder ───────────
def make_sequences(data, seq_len):
    X, y = [], []
    for i in range(len(data) - seq_len):
        X.append(data[i:i+seq_len])
        y.append(data[i+seq_len])
    return torch.stack(X), torch.stack(y)

SEQ_LEN = sequence_length

# Sequences for synthetic (train)
X_train, y_train = make_sequences(data_synth, SEQ_LEN)
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)

# Sequences for real (test)
X_test, y_test = make_sequences(data_real, SEQ_LEN)

In [27]:
# ─── Model Definition ──────────────────────────────────────
class ShallowLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)  # hn shape: (1, batch, hidden_size)
        out = self.linear(hn.squeeze(0))  # squeeze to (batch, hidden_size)
        return out


# ─── Model Init ─────────────────────────────────────────────
model = ShallowLSTM(input_size=X_train.shape[2], hidden_size=64)

# ─── Optimizer & Loss ───────────────────────────────────────
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

In [28]:
# ─── Training ───────────────────────────────────────────────
EPOCHS = 50
for epoch in range(1, EPOCHS + 1):
    model.train()
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # if epoch % 10 == 0 or epoch == 1:
    print(f"Epoch {epoch}: Train MSE = {loss.item():.6f}")

Epoch 1: Train MSE = 0.061166
Epoch 2: Train MSE = 0.055942
Epoch 3: Train MSE = 0.062349
Epoch 4: Train MSE = 0.055855
Epoch 5: Train MSE = 0.056223
Epoch 6: Train MSE = 0.061403
Epoch 7: Train MSE = 0.055855
Epoch 8: Train MSE = 0.058057
Epoch 9: Train MSE = 0.054990
Epoch 10: Train MSE = 0.059917
Epoch 11: Train MSE = 0.058516
Epoch 12: Train MSE = 0.055999
Epoch 13: Train MSE = 0.060728
Epoch 14: Train MSE = 0.062357
Epoch 15: Train MSE = 0.054606
Epoch 16: Train MSE = 0.051315
Epoch 17: Train MSE = 0.057704
Epoch 18: Train MSE = 0.064060
Epoch 19: Train MSE = 0.057147
Epoch 20: Train MSE = 0.055788
Epoch 21: Train MSE = 0.060213
Epoch 22: Train MSE = 0.053340
Epoch 23: Train MSE = 0.055352
Epoch 24: Train MSE = 0.054120
Epoch 25: Train MSE = 0.053724
Epoch 26: Train MSE = 0.063190
Epoch 27: Train MSE = 0.056931
Epoch 28: Train MSE = 0.061038
Epoch 29: Train MSE = 0.053703
Epoch 30: Train MSE = 0.055803
Epoch 31: Train MSE = 0.052391
Epoch 32: Train MSE = 0.060201
Epoch 33: Train M

In [31]:
# --- ADD THESE LINES TO SAVE THE MODEL ---
# Define a path where you want to save your model.
# Use a meaningful name, especially for TRTR vs. TSTR models.
# Example for TRTR Electricity model:
MODEL_SAVE_PATH = '/content/drive/Shareddrives/sp_env/saved_models/LSTM/tstr_VAE_weather.pth'

# Save only the model's learned parameters (state_dict)
torch.save(model.state_dict(), MODEL_SAVE_PATH)

print(f"Model saved to: {MODEL_SAVE_PATH}")

Model saved to: /content/drive/Shareddrives/sp_env/saved_models/LSTM/tstr_VAE_weather.pth


In [32]:
model.eval()
with torch.no_grad():
    preds = model(X_test)
    test_mse = loss_fn(preds, y_test).item()
    test_mae = mean_absolute_error(y_test.numpy(), preds.numpy())

    print(f"Test MSE: {test_mse:.6f}")
    print(f"Test MAE: {test_mae:.6f}")

Test MSE: 1.882251
Test MAE: 0.790721


# Bootstrapping Sample

In [33]:
!pip install tsbootstrap

Collecting tsbootstrap
  Downloading tsbootstrap-0.1.5-py3-none-any.whl.metadata (28 kB)
Collecting scikit-base<0.11,>=0.10.0 (from tsbootstrap)
  Downloading scikit_base-0.10.1-py3-none-any.whl.metadata (8.6 kB)
Collecting scikit-learn<1.6.0,>=1.5.1 (from tsbootstrap)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting scipy<1.14.0,>=1.13 (from tsbootstrap)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging<24.2,>=24.0 (from tsbootstrap)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Downloading tsbootstrap-0.1.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-24.1-py3-none-any.whl 

In [34]:
len(loader_test)

52740

In [35]:
from tsbootstrap import MovingBlockBootstrap
import numpy as np

C_CONSTANT = 1
n = len(loader_test)
calculated_block_length = int(np.round(C_CONSTANT * (n**(1/3))))
print(calculated_block_length)

bootstrap_configs = {
    "weather": {"block_length": calculated_block_length, "n_bootstraps": 15, "rng": 42},       # 6-hour pattern (10-min interval)
    "electricity": {"block_length": calculated_block_length, "n_bootstraps": 15, "rng": 42},   # 1-day pattern (hourly)
    "exchange": {"block_length": calculated_block_length, "n_bootstraps": 15, "rng": 42},      # 1-month pattern (daily)
}

# Example for weather
dataset_name = "weather"
config = bootstrap_configs[dataset_name]

real_test_array = real_data # shape (N, features)
mbb = MovingBlockBootstrap(
    n_bootstraps=config["n_bootstraps"],
    rng=config["rng"],
    block_length=config["block_length"]
)
boot_samples = mbb.bootstrap(real_test_array, return_indices=False)


38


In [36]:
model.load_state_dict(torch.load(MODEL_SAVE_PATH))

<All keys matched successfully>

In [37]:
bootstrap_results = []

for b_idx, boot_real in enumerate(boot_samples):
    # 1. Match the synthetic data size
    syn_trimmed = synthetic_data[:len(boot_real)]

    # 2. Fidelity metrics
    wasserstein = compute_wasserstein(boot_real, syn_trimmed, selected_columns)
    kl = compute_kl_divergence(boot_real, syn_trimmed, selected_columns)

    # 3. Utility metrics
    # Preprocess this bootstrap sample for LSTM (as you do with real_data)
    boot_tensor = torch.tensor(boot_real, dtype=torch.float32)
    Xb_test, yb_test = make_sequences(boot_tensor, SEQ_LEN)

    model.eval()
    with torch.no_grad():
        preds = model(Xb_test)
        mse = mean_squared_error(yb_test.numpy(), preds.numpy())
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(yb_test.numpy(), preds.numpy())

    # 4. Store results
    bootstrap_results.append({
        'bootstrap': b_idx,
        'wasserstein': np.mean(list(wasserstein.values())),
        'kl': np.mean(list(kl.values())),
        'rmse': rmse,
        'mae': mae
    })

(52740, 10) (52740, 10)
0.25095536882718555
0.28452329050581054
0.089212273592277
0.12166978935820838
0.10779443666841812
0.24367702579946254
0.13658289540529348
0.18256764238113235
0.1560823433790404
0.04325150893570941
5.216476338599544
5.460563913327727
12.132889929384941
13.739845221004344
11.619473201549372
5.038200381631906
14.783923282185464
3.545954995414707
14.70805425991151
12.493572459591393
(52740, 10) (52740, 10)
0.24780539888674294
0.2815904049875138
0.0940305232780793
0.12003043631291334
0.11263840588784504
0.2391776884270594
0.13559983042905446
0.18384636206108992
0.15484796506917917
0.04093625965273642
5.282522846344514
5.607556477639885
11.891710619365593
13.922695701139867
11.622126425974631
5.135746920218457
14.696639247439643
3.5216877986578146
14.614887786726946
12.38232584602225
(52740, 10) (52740, 10)
0.24915798403635409
0.2829310019481565
0.09258933632911064
0.11962624668815999
0.11171398756207412
0.2404106864431182
0.14444004243452024
0.18058843600811839
0.153

In [38]:
print(bootstrap_results)

# Assuming bootstrap_results is your list of dicts
df_results = pd.DataFrame(bootstrap_results)

df_results['Dataset'] = 'weather'
df_results['Model'] = 'TimeVAE'

df_results

[{'bootstrap': 0, 'wasserstein': 0.16163165748525377, 'kl': 9.87389539826009, 'rmse': 1.4062040109531893, 'mae': 0.847324013710022}, {'bootstrap': 1, 'wasserstein': 0.16105032749922138, 'kl': 9.867789966952959, 'rmse': 1.3861196163904408, 'mae': 0.830904483795166}, {'bootstrap': 2, 'wasserstein': 0.16168182993849556, 'kl': 9.869687713910759, 'rmse': 1.4549362215831014, 'mae': 0.8839032053947449}, {'bootstrap': 3, 'wasserstein': 0.1594361169461128, 'kl': 9.942386868961815, 'rmse': 1.4087875359856297, 'mae': 0.8493793606758118}, {'bootstrap': 4, 'wasserstein': 0.16303395891481512, 'kl': 9.831864317357596, 'rmse': 1.4930062170033385, 'mae': 0.9202097654342651}, {'bootstrap': 5, 'wasserstein': 0.16229687559276237, 'kl': 9.817737747691567, 'rmse': 1.4442484469993921, 'mae': 0.8801453709602356}, {'bootstrap': 6, 'wasserstein': 0.1637355658241896, 'kl': 9.88757556381402, 'rmse': 1.47881343366915, 'mae': 0.9083816409111023}, {'bootstrap': 7, 'wasserstein': 0.1614391203706083, 'kl': 9.795193093

Unnamed: 0,bootstrap,wasserstein,kl,rmse,mae,Dataset,Model
0,0,0.161632,9.873895,1.406204,0.847324,weather,TimeVAE
1,1,0.16105,9.86779,1.38612,0.830904,weather,TimeVAE
2,2,0.161682,9.869688,1.454936,0.883903,weather,TimeVAE
3,3,0.159436,9.942387,1.408788,0.849379,weather,TimeVAE
4,4,0.163034,9.831864,1.493006,0.92021,weather,TimeVAE
5,5,0.162297,9.817738,1.444248,0.880145,weather,TimeVAE
6,6,0.163736,9.887576,1.478813,0.908382,weather,TimeVAE
7,7,0.161439,9.795193,1.426023,0.863825,weather,TimeVAE
8,8,0.163829,9.905126,1.438447,0.871815,weather,TimeVAE
9,9,0.162378,9.864761,1.431677,0.868296,weather,TimeVAE


In [39]:
summary_row = {
    'Dataset': 'weather',
    'Model': 'TimeVAE',
    'Wasserstein': df_results['wasserstein'].mean(),
    'KL': df_results['kl'].mean(),
    'RMSE': df_results['rmse'].mean(),
    'MAE': df_results['mae'].mean()
}

df_summary = pd.DataFrame([summary_row])
print(df_summary)

       Dataset    Model  Wasserstein        KL     RMSE       MAE
0  electricity  TimeVAE     0.162178  9.847493  1.44007  0.874527
