<a href="https://colab.research.google.com/github/pmxfa/sp-shapely/blob/main/sp_timegan_weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training

In [None]:
!pip install synthcity

Collecting synthcity
  Downloading synthcity-0.2.11-py3-none-any.whl.metadata (37 kB)
Collecting torch<2.3,>=2.1 (from synthcity)
  Downloading torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nflows>=0.14 (from synthcity)
  Downloading nflows-0.14.tar.gz (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy<2.0,>=1.20 (from synthcity)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lifelines<0.30.0,>=0.29.0 (from synthcity)
  Downloading lifelines-0.29.0-py3-none-any.whl.metadata (3.2 kB)
Collecting opacus>=1.3 (from synthcity)
  Downloading opacus-1.5.3-py3-none-any.whl.metadata (8.4 kB)
Collecting networkx<3.0,>2.0 (from s

In [None]:
# Mount Google Drive (Colab only); the dataset and the output folder used below
# both live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import sys
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# synthcity: logger, plugin registry, time-series data loader and (de)serialization helpers
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader
from synthcity.utils.serialization import save_to_file, load_from_file

# Route synthcity log output to stderr with level "INFO"
log.add(sink=sys.stderr, level="INFO")

Mounted at /content/drive
[KeOps] Compiling cuda jit compiler engine ... OK
[pyKeOps] Compiling nvrtc binder for python ... OK


In [None]:
# Location of the raw weather CSV on the mounted shared drive
file_path = "/content/drive/Shareddrives/sp_env/datasets/Weather/weather.csv"

# Load the raw data and show a quick overview: first rows, schema, missing values
df = pd.read_csv(file_path)
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
print('missing values: \n', df.isnull().sum())

                  date  p (mbar)  T (degC)  Tpot (K)  Tdew (degC)  rh (%)  \
0  2020-01-01 00:10:00   1008.89      0.71    273.18        -1.33    86.1   
1  2020-01-01 00:20:00   1008.76      0.75    273.22        -1.44    85.2   
2  2020-01-01 00:30:00   1008.66      0.73    273.21        -1.48    85.1   
3  2020-01-01 00:40:00   1008.64      0.37    272.86        -1.64    86.3   
4  2020-01-01 00:50:00   1008.61      0.33    272.82        -1.50    87.4   

   VPmax (mbar)  VPact (mbar)  VPdef (mbar)  sh (g/kg)  ...  wv (m/s)  \
0          6.43          5.54          0.89       3.42  ...      1.02   
1          6.45          5.49          0.95       3.39  ...      0.43   
2          6.44          5.48          0.96       3.39  ...      0.61   
3          6.27          5.41          0.86       3.35  ...      1.11   
4          6.26          5.47          0.79       3.38  ...      0.49   

   max. wv (m/s)  wd (deg)  rain (mm)  raining (s)  SWDR (W/m�)  \
0           1.60     224.3     

In [None]:
# Seed NumPy's global RNG so the random feature pick below is repeatable
np.random.seed(42)

# Parse 'date', promote it to the index and put the rows in chronological order
df = (
    df.assign(date=pd.to_datetime(df['date']))
    .set_index('date')
    .sort_index()
)

# 'date' now lives in the index, so every remaining column is a candidate feature
columns = df.columns
selected_features = np.random.choice(columns, size=10, replace=False)

print(f"Selected features: {selected_features}")

# Restrict the frame to the 10 sampled features
df = df[selected_features]

Selected features: ['p (mbar)' 'PAR (�mol/m�/s)' 'raining (s)' 'T (degC)' 'sh (g/kg)'
 'VPmax (mbar)' 'wv (m/s)' 'Tdew (degC)' 'max. PAR (�mol/m�/s)'
 'SWDR (W/m�)']


In [None]:
# Keep the latest 5000 rows
df_latest = df.tail(5000)

# Chronological train-test split taken from df_latest:
# first 70% for training (TimeGAN), last 30% held out for testing (TSTR).
train_size = int(0.7 * len(df_latest))
df_train = df_latest.iloc[:train_size]
df_test = df_latest.iloc[train_size:]  # you might use this later for TSTR

# Normalize the data using MinMaxScaler. The scaler is fitted on the training part
# only and then applied to the test part, so no test statistics leak into training.
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df_train)
df_scaled_train = pd.DataFrame(scaled_train, columns=df_train.columns, index=df_train.index)
scaled_test = scaler.transform(df_test)
df_scaled_test = pd.DataFrame(scaled_test, columns=df_test.columns, index=df_test.index)

# Sequence length for time-series data (dataset = per 10 mins, 36 = 6 hrs)
sequence_length = 36
temporal_data = []
observation_times = []

# Generate overlapping windows (stride 1) from the *scaled* training frame only,
# so the model is trained on the normalized values produced above.
for start in range(len(df_scaled_train) - sequence_length + 1):
    sequence = df_scaled_train.iloc[start:start + sequence_length].reset_index(drop=True)
    temporal_data.append(sequence)
    observation_times.append(list(range(sequence_length)))  # relative time within the window

# Dummy outcome for TimeGAN: one all-zero row per sequence, passed to the loader's `outcome`
dummy_outcome = pd.DataFrame(np.zeros(len(temporal_data)), columns=["outcome"])

# --- Create DataLoader for TimeGAN ---
loader = TimeSeriesDataLoader(
    temporal_data=temporal_data,
    observation_times=observation_times,
    static_data=None,
    outcome=dummy_outcome,
)

# Print the loader info
print(f"TimeSeriesDataLoader created with {len(temporal_data)} sequences")

TimeSeriesDataLoader created with 3465 sequences


In [None]:
# Sanity check: number of scaled training rows, followed by the loader's long-format frame
print(len(df_scaled_train), loader.dataframe(), sep="\n")

5000
        seq_id  seq_time_id  seq_temporal_PAR (�mol/m�/s)  \
0            0            0                     -0.489582   
1            0            1                     -0.489582   
2            0            2                     -0.489582   
3            0            3                     -0.489582   
4            0            4                     -0.489582   
...        ...          ...                           ...   
124735    3464           31                      1.686200   
124736    3464           32                      1.781566   
124737    3464           33                      1.790925   
124738    3464           34                      2.100668   
124739    3464           35                      1.854765   

        seq_temporal_SWDR (W/m�)  seq_temporal_T (degC)  \
0                      -0.476489              -1.140307   
1                      -0.476489              -1.082832   
2                      -0.476489              -0.991550   
3                      -0.

In [None]:
from synthcity.plugins import Plugins

# Hyper-parameter overrides handed to the plugin. "mode" is switched to LSTM;
# the original note records RNN as the plugin's default mode.
hparams = dict(mode="LSTM")

# Look up the TimeGAN plugin in the registry and instantiate it with the overrides
syn_model = Plugins().get("timegan", **hparams)

[2025-04-17T03:42:33.277088+0000][2204][CRITICAL] module disabled: /usr/local/lib/python3.11/dist-packages/synthcity/plugins/generic/plugin_goggle.py


In [None]:
# --- Print all parameters of initialized model ---
# Walk the public attributes of the plugin and show every one that is plain data
# (i.e. skip private names and anything callable such as methods).
for attr in dir(syn_model):
    if attr.startswith("_"):
        continue
    if callable(getattr(syn_model, attr)):
        continue
    print(f"{attr}: {getattr(syn_model, attr)}")

batch_size: 64
class_name: TimeGANPlugin
clipping_value: 0
compress_dataset: False
dataloader_sampling_strategy: imbalanced_time_censoring
device: cuda
discriminator_batch_norm: False
discriminator_dropout: 0.1
discriminator_loss: None
discriminator_lr: 0.001
discriminator_n_iter: 1
discriminator_n_layers_hidden: 3
discriminator_n_units_hidden: 300
discriminator_nonlin: leaky_relu
discriminator_weight_decay: 0.001
embedding_penalty: 10
encoder: None
encoder_max_clusters: 20
expecting_conditional: False
fitted: False
gamma_penalty: 1
generator_batch_norm: False
generator_dropout: 0.01
generator_loss: None
generator_lr: 0.001
generator_n_layers_hidden: 2
generator_n_units_hidden: 150
generator_nonlin: leaky_relu
generator_nonlin_out_continuous: tanh
generator_nonlin_out_discrete: softmax
generator_residual: True
generator_weight_decay: 0.001
mode: LSTM
module_name: synthcity.plugins.time_series.plugin_timegan
module_relative_path: ../time_series/plugin_timegan.py
moments_penalty: 100
n_i

## Fitting the model

In [None]:
# --- Train the model ---
# Fits TimeGAN on the windowed training loader. This is the expensive step of the
# notebook: the recorded run needed about 2 h 50 min for 1000 iterations.
# Being the last expression of the cell, the object returned by fit() is echoed as output.
syn_model.fit(loader)

100%|██████████| 1000/1000 [2:49:36<00:00, 10.18s/it]


<synthcity.plugins.time_series.plugin_timegan.TimeGANPlugin at 0x797e9b5063d0>

In [None]:
# --- Generate Synthetic Data ---
# Request as many synthetic sequences as there are real training windows
n_samples = len(temporal_data)
syn_data = syn_model.generate(count=n_samples)
# Shape of the generated data in the loader's long format (rows, columns)
print(syn_data.shape)

(113130, 13)


In [None]:
# --- Save with automated format ---
import datetime
import os

# Timestamp (MMDDYY-HHMMSS) so successive exports never overwrite each other
now = datetime.datetime.now()
timestamp = now.strftime("%m%d%y-%H%M%S")

# Output folder on the shared drive
base_dir = "/content/drive/Shareddrives/sp_env/synthetic_datasets/TimeGAN/weather"  #CHANGE THIS
# exist_ok=True creates the folder when missing and is a no-op otherwise; it replaces
# the separate exists() check, which could race with another writer on the shared drive.
os.makedirs(base_dir, exist_ok=True)

# Construct the filename: <timestamp>-<model class name>-n_<number of generated sequences>.csv
model_name = type(syn_model).__name__.lower()  # Get model name dynamically
# Use the count actually passed to generate() instead of a hard-coded number, so the
# label in the filename cannot drift away from the data stored in the file.
filename = f"{timestamp}-{model_name}-n_{n_samples}.csv"
filepath = os.path.join(base_dir, filename)

# Save the data (long-format frame of the synthetic loader, without the row index)
df_syn = syn_data.dataframe()
df_syn.to_csv(filepath, index=False)

print(f"Synthetic data saved to: {filepath}")

Synthetic data saved to: /content/drive/Shareddrives/sp_env/synthetic_datasets/TimeGAN/weather/041725-064438-timeganplugin-n_3000.csv


# Evaluation

## Prerequisites

In [None]:
# Echo the training loader so the table of real sequences (seq_id, seq_time_id,
# seq_temporal_* feature columns, seq_out_outcome) is visible before the evaluation.
loader

Unnamed: 0,seq_id,seq_time_id,seq_temporal_PAR (�mol/m�/s),seq_temporal_SWDR (W/m�),seq_temporal_T (degC),seq_temporal_Tdew (degC),seq_temporal_VPmax (mbar),seq_temporal_max. PAR (�mol/m�/s),seq_temporal_p (mbar),seq_temporal_raining (s),seq_temporal_sh (g/kg),seq_temporal_wv (m/s),seq_out_outcome
0,0,0,-0.489582,-0.476489,-1.140307,-0.513990,-0.984883,-0.487936,0.970766,-0.246097,-0.583301,-0.756261,0.0
1,0,1,-0.489582,-0.476489,-1.082832,-0.489481,-0.941400,-0.487936,0.970766,-0.246097,-0.565085,-1.165378,0.0
2,0,2,-0.489582,-0.476489,-0.991550,-0.468034,-0.876175,-0.487936,0.979837,-0.246097,-0.546869,-1.187295,0.0
3,0,3,-0.489582,-0.476489,-0.981407,-0.468034,-0.870739,-0.487936,0.981651,-0.246097,-0.546869,-0.873152,0.0
4,0,4,-0.489582,-0.476489,-0.984788,-0.446588,-0.870739,-0.487936,0.977116,-0.246097,-0.528653,-0.997348,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124735,3464,31,1.686200,1.559720,0.874675,0.824865,0.743581,1.584735,1.028819,-0.246097,0.673597,0.719484,0.0
124736,3464,32,1.781566,1.664691,0.878056,0.830992,0.743581,1.629585,1.027005,-0.246097,0.682705,0.763318,0.0
124737,3464,33,1.790925,1.681775,0.881437,0.837120,0.749016,1.691363,1.023376,-0.246097,0.682705,0.595288,0.0
124738,3464,34,2.100668,2.023452,0.932150,0.861630,0.803371,1.918709,1.022469,-0.246097,0.710029,-0.025694,0.0


In [None]:
# Derive the evaluated feature columns from the loader instead of hard-coding them.
# The loader prefixes every temporal feature with "seq_temporal_", so this yields the
# same columns in the same order, keeps working if the feature selection changes, and
# avoids copying column names that contain non-ASCII unit symbols by hand.
real_frame = loader.dataframe()
selected_columns = [col for col in real_frame.columns if str(col).startswith("seq_temporal_")]
# Ensure real_data and synthetic_data only contain the selected columns (same order in both)
real_data = real_frame[selected_columns].to_numpy()
synthetic_data = syn_data.dataframe()[selected_columns].to_numpy()

In [None]:
# Quick look at both evaluation arrays: values, container types and shapes
separator = "\n ------------------------------------------------------- \n"
print(real_data, separator, synthetic_data)
print(type(real_data), type(synthetic_data))
print(real_data.shape, synthetic_data.shape)

[[-0.48958181 -0.47648878 -1.14030659 ... -0.24609724 -0.58330095
  -0.75626087]
 [-0.48958181 -0.47648878 -1.08283228 ... -0.24609724 -0.56508504
  -1.16537829]
 [-0.48958181 -0.47648878 -0.99154955 ... -0.24609724 -0.54686912
  -1.1872953 ]
 ...
 [ 1.79092525  1.68177474  0.88143688 ... -0.24609724  0.68270521
   0.59528777]
 [ 2.10066753  2.02345234  0.93214951 ... -0.24609724  0.71002908
  -0.02569404]
 [ 1.85476526  1.78731515  0.99976635 ... -0.24609724  0.74646091
   0.19347601]] 
 ------------------------------------------------------- 
 [[-0.48949689 -0.47648878  1.22456085 ... -0.24609724 -0.01670865
  -0.74878515]
 [-0.48948907 -0.47648878 -0.36324033 ... -0.24609724  0.10520755
   0.36740728]
 [-0.48948194 -0.47648878 -0.60093091 ... -0.24609724  0.15317317
  -0.93012676]
 ...
 [-0.48934068 -0.47648878  1.21858415 ... -0.24609724  0.62887574
  -0.13767291]
 [-0.48936171 -0.47648878 -0.4905377  ... -0.24609724  0.62858938
  -1.05388676]
 [-0.48935645 -0.47648878 -0.49028119 

## Generate distance metrics

### Helper Functions

In [None]:
from scipy.stats import wasserstein_distance, entropy
import numpy as np

def compute_wasserstein(real_data, synthetic_data, selected_columns):
    """
    Per-feature Wasserstein distance between real and synthetic samples.

    Both inputs are 2-D arrays whose columns line up with ``selected_columns``.
    The longer array is cut down to the length of the shorter one (leading rows
    are kept, order untouched) before the distances are computed.

    Prints the shapes of the trimmed arrays and then each distance, and returns
    a dict mapping every column name to its distance.
    """
    # Trim both arrays to the common number of rows (no shuffling, no sampling)
    n_rows = min(len(real_data), len(synthetic_data))
    real_trimmed, synthetic_trimmed = real_data[:n_rows], synthetic_data[:n_rows]
    print(real_trimmed.shape,synthetic_trimmed.shape)

    wasserstein_results = {}
    for col_idx, col_name in enumerate(selected_columns):
        # 1-D distance between the two empirical distributions of this feature
        distance = wasserstein_distance(
            real_trimmed[:, col_idx], synthetic_trimmed[:, col_idx]
        )
        wasserstein_results[col_name] = distance
        print(f"{distance}")

    return wasserstein_results

def compute_kl_divergence(real_data, synthetic_data, selected_columns, bins=50):
    """
    Per-feature KL divergence KL(real || synthetic) estimated from histograms.

    Parameters
    ----------
    real_data, synthetic_data : 2-D arrays whose columns line up with
        ``selected_columns``. The longer one is trimmed to the length of the
        shorter one (leading rows kept, order untouched).
    selected_columns : sequence of column names, one per array column.
    bins : number of equal-width histogram bins per feature.

    Returns
    -------
    dict mapping each column name to its KL divergence. Each value is also
    printed as it is computed.

    Both histograms of a feature are built on the SAME bin edges, spanning the
    combined range of the real and synthetic values. With independently chosen
    edges, bin k of one histogram covers a different interval than bin k of the
    other, and the divergence becomes meaningless (e.g. a purely shifted copy
    of the data would score 0).
    """

    # Ensure both datasets have the same number of samples
    min_length = min(len(real_data), len(synthetic_data))
    real_trimmed = real_data[:min_length]  # Keep original order
    synthetic_trimmed = synthetic_data[:min_length]  # Match size

    kl_results = {}

    for i, col in enumerate(selected_columns):
        real_col = real_trimmed[:, i]
        synth_col = synthetic_trimmed[:, i]

        # Shared bin edges over the union of both samples
        edges = np.histogram_bin_edges(np.concatenate([real_col, synth_col]), bins=bins)
        real_counts, _ = np.histogram(real_col, bins=edges)
        synth_counts, _ = np.histogram(synth_col, bins=edges)

        # Turn counts into probabilities, then pad with a tiny constant because
        # KL divergence is undefined where the reference probability is zero.
        real_prob = real_counts / real_counts.sum() + 1e-10
        synth_prob = synth_counts / synth_counts.sum() + 1e-10

        # scipy's entropy(p, q) renormalizes both inputs and returns KL(p || q)
        kl_div = entropy(real_prob, synth_prob)
        kl_results[col] = kl_div
        print(f"{kl_div}")

    return kl_results

### Generate Metrics

In [None]:
# Distribution-level comparison of the real and synthetic feature arrays
# (real_data / synthetic_data, columns ordered as in selected_columns).

# Wasserstein distance per feature
wasserstein_results = compute_wasserstein(real_data, synthetic_data, selected_columns)
print("Wasserstein Distance Results:", wasserstein_results, sep="\n")

# KL divergence per feature
kl_results = compute_kl_divergence(real_data, synthetic_data, selected_columns)
print("KL Divergence Results:", kl_results, sep="\n")

(113130, 10) (113130, 10)
0.4852279555622918
0.474036286285225
0.1876984601534289
0.44966887504583986
0.2530056852486972
0.4728836247111148
0.6521579490880941
0.17002787425002172
0.23972583890384996
0.23945477508683838
Wasserstein Distance Results:
{'seq_temporal_PAR (�mol/m�/s)': 0.4852279555622918, 'seq_temporal_SWDR (W/m�)': 0.474036286285225, 'seq_temporal_T (degC)': 0.1876984601534289, 'seq_temporal_Tdew (degC)': 0.44966887504583986, 'seq_temporal_VPmax (mbar)': 0.2530056852486972, 'seq_temporal_max. PAR (�mol/m�/s)': 0.4728836247111148, 'seq_temporal_p (mbar)': 0.6521579490880941, 'seq_temporal_raining (s)': 0.17002787425002172, 'seq_temporal_sh (g/kg)': 0.23972583890384996, 'seq_temporal_wv (m/s)': 0.23945477508683838}
4.242986387961396
3.845901913908606
10.411177489686091
11.515623165165554
6.884882158379057
3.383822733621905
11.367472776635818
0.5021326971410645
10.617129494327695
8.683382758410538
KL Divergence Results:
{'seq_temporal_PAR (�mol/m�/s)': 4.242986387961396, 'seq

# LSTM downstream