<a href="https://colab.research.google.com/github/pmxfa/sp-shapely/blob/main/sp_timegan_electricity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install synthcity

Collecting synthcity
  Downloading synthcity-0.2.11-py3-none-any.whl.metadata (37 kB)
Collecting torch<2.3,>=2.1 (from synthcity)
  Downloading torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nflows>=0.14 (from synthcity)
  Downloading nflows-0.14.tar.gz (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy<2.0,>=1.20 (from synthcity)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lifelines<0.30.0,>=0.29.0 (from synthcity)
  Downloading lifelines-0.29.0-py3-none-any.whl.metadata (3.2 kB)
Collecting opacus>=1.3 (from synthcity)
  Downloading opacus-1.5.3-py3-none-any.whl.metadata (8.4 kB)
Collecting networkx<3.0,>2.0 (from s

# Training

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import sys
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader

log.add(sink=sys.stderr, level="INFO")

Mounted at /content/drive
[KeOps] Compiling cuda jit compiler engine ... OK
[pyKeOps] Compiling nvrtc binder for python ... OK


In [2]:
file_path = "/content/drive/Shareddrives/sp_env/datasets/Electricity Transformer Dataset (ETDataset)/ETTh1.csv"

df = pd.read_csv(file_path)
print(df.head())
print(df.info())
print(df.isnull().sum())

                  date   HUFL   HULL   MUFL   MULL   LUFL   LULL         OT
0  2016-07-01 00:00:00  5.827  2.009  1.599  0.462  4.203  1.340  30.531000
1  2016-07-01 01:00:00  5.693  2.076  1.492  0.426  4.142  1.371  27.787001
2  2016-07-01 02:00:00  5.157  1.741  1.279  0.355  3.777  1.218  27.787001
3  2016-07-01 03:00:00  5.090  1.942  1.279  0.391  3.807  1.279  25.044001
4  2016-07-01 04:00:00  5.358  1.942  1.492  0.462  3.868  1.279  21.948000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17420 entries, 0 to 17419
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    17420 non-null  object 
 1   HUFL    17420 non-null  float64
 2   HULL    17420 non-null  float64
 3   MUFL    17420 non-null  float64
 4   MULL    17420 non-null  float64
 5   LUFL    17420 non-null  float64
 6   LULL    17420 non-null  float64
 7   OT      17420 non-null  float64
dtypes: float64(7), object(1)
memory usage: 1.1+ MB
None
date    0

In [3]:
# Convert 'date' to datetime, set as index, and sort
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df.sort_index(inplace=True)

# Keep the latest 5000 rows
df_latest = df.tail(5000)

# Train-test split: 70% for training (for TimeGAN), 30% for testing (TSTR)
train_size = int(0.7 * len(df_latest))
df_train = df_latest.iloc[:train_size]
df_test = df_latest.iloc[train_size:]  # use later for LSTM-TSTR

# Normalize the data
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df_train)
df_scaled_train = pd.DataFrame(scaled_train, columns=df_train.columns, index=df_train.index)
scaled_test = scaler.transform(df_test)
df_scaled_test = pd.DataFrame(scaled_train, columns=df_train.columns, index=df_train.index)

# Sequence length for time-series data (dataset = hourly; 24 hours)
sequence_length = 24
temporal_data = []
observation_times = []

# Generate sequences from df_scaled_train only
for start in range(len(df_scaled_train) - sequence_length + 1):
    sequence = df_scaled_train.iloc[start:start + sequence_length].reset_index(drop=True)
    temporal_data.append(sequence)
    observation_times.append(list(range(sequence_length)))  # relative time within the window

# Dummy outcome for TimeGAN (can be used in DataLoader)
dummy_outcome = pd.DataFrame(np.zeros(len(temporal_data)), columns=["outcome"])

# Create DataLoader for TimeGAN
loader = TimeSeriesDataLoader(
    temporal_data=temporal_data,
    observation_times=observation_times,
    static_data=None,
    outcome=dummy_outcome,
)

# Print the loader info
print(f"TimeSeriesDataLoader created with {len(temporal_data)} sequences")


TimeSeriesDataLoader created with 3477 sequences


In [None]:
print(len(df_train))
print(loader.dataframe())

3500
       seq_id  seq_time_id  seq_temporal_HUFL  seq_temporal_HULL  \
0           0            0           0.642388           0.302720   
1           0            1           0.613277           0.410782   
2           0            2           0.656955           0.427003   
3           0            3           0.647244           0.378339   
4           0            4           0.711963           0.416189   
...       ...          ...                ...                ...   
83443    3476           19           0.841426           0.362118   
83444    3476           20           0.865681           0.362118   
83445    3476           21           0.864062           0.437818   
83446    3476           22           0.930425           0.481075   
83447    3476           23           0.791250           0.405375   

       seq_temporal_LUFL  seq_temporal_LULL  seq_temporal_MUFL  \
0               0.429806           0.589646           0.659849   
1               0.429806           0.666667   

In [None]:
hparams = {
          "mode": "LSTM", # default mode = RNN
}

# Load TimeGAN with custom parameters
syn_model = Plugins().get("timegan", **hparams)

[2025-04-26T12:35:18.790696+0000][1314][CRITICAL] module disabled: /usr/local/lib/python3.11/dist-packages/synthcity/plugins/generic/plugin_goggle.py
[2025-04-26T12:35:18.790696+0000][1314][CRITICAL] module disabled: /usr/local/lib/python3.11/dist-packages/synthcity/plugins/generic/plugin_goggle.py


In [None]:
# Print all parameters of initialized model
for attr in dir(syn_model):
    if not attr.startswith("_") and not callable(getattr(syn_model, attr)):
        print(f"{attr}: {getattr(syn_model, attr)}")

batch_size: 64
class_name: TimeGANPlugin
clipping_value: 0
compress_dataset: False
dataloader_sampling_strategy: imbalanced_time_censoring
device: cuda
discriminator_batch_norm: False
discriminator_dropout: 0.1
discriminator_loss: None
discriminator_lr: 0.001
discriminator_n_iter: 1
discriminator_n_layers_hidden: 3
discriminator_n_units_hidden: 300
discriminator_nonlin: leaky_relu
discriminator_weight_decay: 0.001
embedding_penalty: 10
encoder: None
encoder_max_clusters: 20
expecting_conditional: False
fitted: False
gamma_penalty: 1
generator_batch_norm: False
generator_dropout: 0.01
generator_loss: None
generator_lr: 0.001
generator_n_layers_hidden: 2
generator_n_units_hidden: 150
generator_nonlin: leaky_relu
generator_nonlin_out_continuous: tanh
generator_nonlin_out_discrete: softmax
generator_residual: True
generator_weight_decay: 0.001
mode: LSTM
module_name: synthcity.plugins.time_series.plugin_timegan
module_relative_path: ../time_series/plugin_timegan.py
moments_penalty: 100
n_i

## fitting the model

In [None]:
print(loader.shape)

In [None]:
#  Train the model
syn_model.fit(loader)

100%|██████████| 1000/1000 [2:02:26<00:00,  7.35s/it]


<synthcity.plugins.time_series.plugin_timegan.TimeGANPlugin at 0x787b51703a90>

In [None]:
saved_model = syn_model.save()

In [None]:
from synthcity.utils.serialization import save_to_file, load_from_file

# Save model to drive
# save_to_file('/content/drive/Shareddrives/sp_env/test_model.pkl', syn_model)
save_to_file('/content/drive/Shareddrives/sp_env/saved_models/GAN_Electricity.pkl', syn_model)

# Load the model
# loaded_model = load_from_file('/content/drive/Shareddrives/sp_env/test_model.pkl')

In [None]:
n_samples = len(temporal_data)
syn_data = syn_model.generate(count=n_samples)
print(data.shape)

(1240, 10)


In [None]:
n_samples = len(temporal_data)
print(n_samples)
data = loaded_model.generate(count=n_samples)
print(data.shape)

3477
(43070, 10)


In [None]:
import inspect
print(inspect.getsource(syn_model.generate))

    @validate_arguments
    def generate(
        self,
        count: Optional[int] = None,
        constraints: Optional[Constraints] = None,
        random_state: Optional[int] = None,
        **kwargs: Any,
    ) -> DataLoader:
        """Synthetic data generation method.

        Args:
            count: optional int.
                The number of samples to generate. If None, it generated len(reference_dataset) samples.
            cond: Optional, Union[pd.DataFrame, pd.Series, np.ndarray].
                Optional Generation Conditional. The conditional can be used only if the model was trained using a conditional too.
                If provided, it must have `count` length.
                Not all models support conditionals. The conditionals can be used in VAEs or GANs to speed-up the generation under some constraints. For model agnostic solutions, check out the `constraints` parameter.
            constraints: optional Constraints.
                Optional constraints to app

In [None]:
n_samples = len(temporal_data)
print(n_samples)
data1 = loaded_model.generate(count=1)
data2 = loaded_model.generate(count=10)
data3 = loaded_model.generate(count=100)
print(data1.shape)
print(data2.shape)
print(data3.shape)

3477
(16, 10)
(134, 10)
(1294, 10)


In [None]:
# Save with automated format
import datetime
import os
# Get the current date and time
now = datetime.datetime.now()
timestamp = now.strftime("%m%d%y-%H%M%S")  # MMDDYY-HHMMSS format

# Define the base directory
base_dir = "/content/drive/Shareddrives/sp_env/synthetic_datasets/TimeGAN/electricity"  #CHANGE THIS
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Construct the filename
model_name = type(syn_model).__name__.lower() # Get model name dynamically
filename = f"{timestamp}-{model_name}-n_3000.csv"
filepath = os.path.join(base_dir, filename)

# Save the data
df_syn = syn_data.dataframe()
df_syn.to_csv(filepath, index=False)

print(f"Synthetic data saved to: {filepath}")

Synthetic data saved to: /content/drive/Shareddrives/sp_env/synthetic_datasets/TimeGAN/electricity/042625-152713-timeganplugin-n_3000.csv


# Evaluation

## Prerequisites

In [None]:
# Define selected columns explicitly
selected_columns = ['seq_temporal_HUFL', 'seq_temporal_HULL', 'seq_temporal_LUFL', 'seq_temporal_LULL', 'seq_temporal_MUFL', 'seq_temporal_MULL', 'seq_temporal_OT']

# Ensure real_data and synthetic_data only contain the selected columns
real_data = loader.dataframe()[selected_columns].to_numpy()
synthetic_data = syn_data.dataframe()[selected_columns].to_numpy()

In [None]:
#  Check datasets

print(real_data, "\n ------------------------------------------------------- \n", synthetic_data)
print(type(real_data),type(synthetic_data))
print(real_data.shape,synthetic_data.shape)

""" TODO
[] add adjusting off dataset to fit min length here
[] remove min length stuff in helper funcs
"""

[[0.64238779 0.30271972 0.42980559 ... 0.65984936 0.33017688 0.37748344]
 [0.61327729 0.41078203 0.42980559 ... 0.62082855 0.4031624  0.393583  ]
 [0.65695512 0.42700348 0.36400289 ... 0.675646   0.39360371 0.40562914]
 ...
 [0.86406245 0.43781779 0.6272138  ... 0.88011299 0.37779167 0.61446676]
 [0.93042472 0.48107499 0.77624185 ... 0.91636155 0.37779167 0.6626513 ]
 [0.79124994 0.40537487 0.38588913 ... 0.82715244 0.43487582 0.7309317 ]] 
 ------------------------------------------------------- 
 [[0.67353536 0.32652558 0.43057038 ... 0.83059878 0.33310791 0.42354418]
 [0.67358549 0.44509754 0.38212669 ... 0.83059413 0.41668498 0.50057416]
 [0.85121252 0.28399383 0.43061993 ... 0.8305551  0.37194391 0.42362864]
 ...
 [0.7319141  0.56629246 0.38196447 ... 0.86278558 0.37179222 0.50062887]
 [0.67342589 0.44475754 0.38193633 ... 0.79199183 0.33283287 0.50064705]
 [0.75042885 0.2835606  0.4799513  ... 0.83063244 0.33277079 0.37934677]]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(834

' TODO\n[] add adjusting off dataset to fit min length here\n[] remove min length stuff in helper funcs\n'

## Generate distance metrics

### Helper Functions

In [None]:
from scipy.stats import wasserstein_distance, entropy
import numpy as np

def compute_wasserstein(real_data, synthetic_data, selected_columns):
    """
    Computes Wasserstein Distance between real and synthetic time-series data.

    """

    # Ensure both datasets have the same number of samples
    min_length = min(len(real_data), len(synthetic_data))
    real_trimmed = real_data[:min_length]  # Keep original order (no random sampling)
    synthetic_trimmed = synthetic_data[:min_length]  # Match size
    print(real_trimmed.shape,synthetic_trimmed.shape)

    wasserstein_results = {}

    # Compute Wasserstein Distance for each feature
    for i, col in enumerate(selected_columns):
        w_dist = wasserstein_distance(real_trimmed[:, i], synthetic_trimmed[:, i])
        wasserstein_results[col] = w_dist
        print(f"{w_dist}")

    return wasserstein_results

def compute_kl_divergence(real_data, synthetic_data, selected_columns, bins=50):
    """
    Computes KL Divergence between real and synthetic time-series data.

    """

    # Ensure both datasets have the same number of samples
    min_length = min(len(real_data), len(synthetic_data))
    real_trimmed = real_data[:min_length]  # Keep original order
    synthetic_trimmed = synthetic_data[:min_length]  # Match size

    kl_results = {}

    for i, col in enumerate(selected_columns):
        # Compute histogram-based probability distributions
        real_hist, _ = np.histogram(real_trimmed[:, i], bins=bins, density=True)
        synth_hist, _ = np.histogram(synthetic_trimmed[:, i], bins=bins, density=True)

        # Avoid zero probabilities (KL Divergence is undefined for zero values)
        real_hist += 1e-10
        synth_hist += 1e-10

        # Compute KL Divergence
        kl_div = entropy(real_hist, synth_hist)
        kl_results[col] = kl_div
        print(f"{kl_div}")

    return kl_results

### Generate Metrics

In [None]:
# Compute Wasserstein Distance
wasserstein_results = compute_wasserstein(real_data, synthetic_data, selected_columns)
print("Wasserstein Distance Results:")
print(wasserstein_results)

# Compute KL Divergence
kl_results = compute_kl_divergence(real_data, synthetic_data, selected_columns)
print("KL Divergence Results:")
print(kl_results)

(42946, 7) (42946, 7)
0.05639391359444064
0.03758366863714886
0.09343952591018045
0.05978232416685537
0.08429052923356815
0.06523718684914633
0.053477183409293055
Wasserstein Distance Results:
{'seq_temporal_HUFL': 0.05639391359444064, 'seq_temporal_HULL': 0.03758366863714886, 'seq_temporal_LUFL': 0.09343952591018045, 'seq_temporal_LULL': 0.05978232416685537, 'seq_temporal_MUFL': 0.08429052923356815, 'seq_temporal_MULL': 0.06523718684914633, 'seq_temporal_OT': 0.053477183409293055}
10.40922091460919
12.911924636011225
12.882915120034097
10.578358954241521
11.11602725906944
14.83792841779404
14.23388670843781
KL Divergence Results:
{'seq_temporal_HUFL': 10.40922091460919, 'seq_temporal_HULL': 12.911924636011225, 'seq_temporal_LUFL': 12.882915120034097, 'seq_temporal_LULL': 10.578358954241521, 'seq_temporal_MUFL': 11.11602725906944, 'seq_temporal_MULL': 14.83792841779404, 'seq_temporal_OT': 14.23388670843781}


# LSTM downstream

In [10]:
real_data = df_scaled_test
df_synth = pd.read_csv("/content/drive/Shareddrives/sp_env/synthetic_datasets/TimeGAN/electricity/042625-152713-timeganplugin-n_3000.csv")

# 2. Drop the unwanted column
real_data = real_data.drop(columns=["seq_id", "seq_time_id", "seq_out_outcome"], errors="ignore")
df_synth = df_synth.drop(columns=["seq_id", "seq_time_id", "seq_out_outcome"], errors="ignore")

In [11]:
print(f"real_data: {real_data.shape}, synthetic_data: {df_synth.shape}")

real_data: (3500, 7), synthetic_data: (42946, 7)


In [14]:
#@title ✧.* libraries ✧.*

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

recheck LSTM notebook for

In [13]:
# Convert to tensors (float32 for PyTorch)
data_real = torch.tensor(real_data.values, dtype=torch.float32)
data_synth = torch.tensor(df_synth.values, dtype=torch.float32)

# ──────── Sequence builder ───────────
def make_sequences(data, seq_len):
    X, y = [], []
    for i in range(len(data) - seq_len):
        X.append(data[i:i+seq_len])
        y.append(data[i+seq_len])
    return torch.stack(X), torch.stack(y)

SEQ_LEN = sequence_length

# Sequences for synthetic (train)
X_train, y_train = make_sequences(data_synth, SEQ_LEN)
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)

# Sequences for real (test)
X_test, y_test = make_sequences(data_real, SEQ_LEN)


In [16]:
#@title ✧.* model definition and training ✧.*

# ─── Model Definition ──────────────────────────────────────
class ShallowLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=32):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)  # hn shape: (1, batch, hidden_size)
        out = self.linear(hn.squeeze(0))  # squeeze to (batch, hidden_size)
        return out


"""
Based on a study by Deswal and Kumar(2024), they indicated the following
hyperparameters to be the best case for the dataset in their study (stock price).

epoch = 200
batch_size = 32
neurons = 512
learning_rate = 0.002
dropout = 0.01

*did not use yet due to long training time
"""

class TunedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=64, dropout=0.01):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        output, _ = self.lstm(x)
        last = output[:, -1, :]  # take the last time step
        return self.linear(last)

# ─── Model Init ─────────────────────────────────────────────
# model = TunedLSTM(input_size=X_train.shape[2], hidden_size=64, dropout=0.01)
model = ShallowLSTM(input_size=X_train.shape[2], hidden_size=32)

# ─── Optimizer & Loss ───────────────────────────────────────
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

# ─── Training ───────────────────────────────────────────────
EPOCHS = 50
for epoch in range(1, EPOCHS + 1):
    model.train()
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # if epoch % 10 == 0 or epoch == 1:
    print(f"Epoch {epoch}: Train MSE = {loss.item():.6f}")

Epoch 1: Train MSE = 0.008179
Epoch 2: Train MSE = 0.006505
Epoch 3: Train MSE = 0.014976
Epoch 4: Train MSE = 0.007806
Epoch 5: Train MSE = 0.014187
Epoch 6: Train MSE = 0.007589
Epoch 7: Train MSE = 0.008203
Epoch 8: Train MSE = 0.015628
Epoch 9: Train MSE = 0.003408
Epoch 10: Train MSE = 0.004471
Epoch 11: Train MSE = 0.004497
Epoch 12: Train MSE = 0.004948
Epoch 13: Train MSE = 0.005701
Epoch 14: Train MSE = 0.006819
Epoch 15: Train MSE = 0.004917
Epoch 16: Train MSE = 0.011983
Epoch 17: Train MSE = 0.005745
Epoch 18: Train MSE = 0.008743
Epoch 19: Train MSE = 0.004548
Epoch 20: Train MSE = 0.017778
Epoch 21: Train MSE = 0.010611
Epoch 22: Train MSE = 0.012294
Epoch 23: Train MSE = 0.015670
Epoch 24: Train MSE = 0.004537
Epoch 25: Train MSE = 0.005124
Epoch 26: Train MSE = 0.005350
Epoch 27: Train MSE = 0.011767
Epoch 28: Train MSE = 0.016959
Epoch 29: Train MSE = 0.007853
Epoch 30: Train MSE = 0.009425
Epoch 31: Train MSE = 0.006724
Epoch 32: Train MSE = 0.007729
Epoch 33: Train M

In [19]:
#@title ✧.* model evaluation ✧.*
model.eval()
with torch.no_grad():
    preds = model(X_test)
    test_mse = loss_fn(preds, y_test).item()
    test_mae = mean_absolute_error(y_test.numpy(), preds.numpy())

    print(f"Test MSE: {test_mse:.6f}")
    print(f"Test MAE: {test_mae:.6f}")

Test MSE: 0.054213
Test MAE: 0.177336
