**CAUTION: Training the synthesizers in this notebook takes a long time.**  
To skip the training cells, load the stored trained models from `models_single/` instead.

In [1]:
import os
import time
import math
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import ks_2samp
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.single_table import evaluate_quality as st_evaluate_quality
from sdv.evaluation.multi_table import evaluate_quality as mt_evaluate_quality
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.multi_table import HMASynthesizer

In [2]:
def cos_test(df1, df2):
    """Return the mean pairwise cosine similarity between the columns of two frames.

    Both frames' values are transposed so that each column becomes one
    vector. Every column of ``df1`` is compared with every column of ``df2``
    and the resulting similarity matrix is averaged into a single number.
    """
    column_vectors_1 = df1.values.T
    column_vectors_2 = df2.values.T
    similarity_matrix = metrics.pairwise.cosine_similarity(
        column_vectors_1, column_vectors_2
    )
    return np.mean(similarity_matrix)

In [21]:
def batch_cos_test(collection1, collection2):
    """Score every table in ``collection1`` against its namesake in ``collection2``.

    Returns a dict keyed by table name whose value is ``1 - cos_test(...)``,
    i.e. one minus the mean pairwise cosine similarity of the two tables.
    """
    # NOTE(review): 1 - similarity is a distance-style score (smaller means the
    # columns are more aligned) -- confirm this is the intended direction.
    return {
        table_name: 1 - cos_test(collection2[table_name], collection1[table_name])
        for table_name in collection1.keys()
    }

In [4]:
def ks_test(df1, df2):
    """Run a two-sample KS test per column and average the results.

    For every column name in ``df1`` the column is compared with the
    same-named column of ``df2`` using ``ks_2samp``.

    Returns
    -------
    tuple
        ``(mean KS statistic, mean p-value)`` over all compared columns.
    """
    per_column = [ks_2samp(df1[name], df2[name]) for name in df1.columns]
    statistics = [result[0] for result in per_column]
    p_values = [result[1] for result in per_column]
    return np.mean(statistics), np.mean(p_values)

In [22]:
def batch_ks_test(collection1, collection2):
    """Compute a KS-based score for every table shared by two collections.

    For each table name in ``collection1`` the mean KS statistic returned by
    ``ks_test`` is turned into ``1 - statistic``, so a larger value means the
    column distributions are closer. The mean p-value is not used.
    """
    return {
        table_name: 1 - ks_test(collection1[table_name], collection2[table_name])[0]
        for table_name in collection1
    }

# Load Processed Data From Generation Stage

In [6]:
# Real tables saved by the generation stage.
# pickle can execute arbitrary code on load -- only unpickle files you trust.
with open('pkl/real_data_collection.pkl', 'rb') as pkl_file:
    real_data_collection = pickle.load(pkl_file)

In [7]:
# Synthetic tables saved by the generation stage (shown later as 'New Approach').
# pickle can execute arbitrary code on load -- only unpickle files you trust.
with open('pkl/synthetic_data_collection.pkl', 'rb') as pkl_file:
    synthetic_data_collection = pickle.load(pkl_file)

In [8]:
# SDV metadata object; its ``.tables[name]`` entries are passed to the synthesizers.
# pickle can execute arbitrary code on load -- only unpickle files you trust.
with open('pkl/sdv_metadata.pkl', 'rb') as pkl_file:
    sdv_metadata = pickle.load(pkl_file)

# Benchmark

In [9]:
# Record how many rows each real table has, so each synthesizer can be asked
# for the same number of synthetic rows.
generation_dict = {
    table_name: {'nrows': len(table)}
    for table_name, table in real_data_collection.items()
}

In [10]:
# Display the per-table row counts.
generation_dict

{'agency': {'nrows': 15},
 'calendar': {'nrows': 121},
 'calendar_dates': {'nrows': 674},
 'routes': {'nrows': 215},
 'stops': {'nrows': 6714},
 'stop_times': {'nrows': 966790},
 'trips': {'nrows': 32403}}

## GaussianCopula

In [12]:
def collection_gaussiancopula_training(data_collection, multi_metadata):
    """Fit one GaussianCopulaSynthesizer per table and save each fitted model.

    Parameters
    ----------
    data_collection : dict
        Maps table name to the table to fit on.
    multi_metadata
        Object exposing ``.tables[table_name]`` with each table's metadata.

    Returns
    -------
    None
        Models are written to ``models_single/<table_name>_gc.pkl``.
    """
    # Create the output folder up front so a missing directory cannot make
    # ``save`` fail after the fit has already been paid for.
    os.makedirs('models_single', exist_ok=True)
    for df_name, df in data_collection.items():
        synthesizer = GaussianCopulaSynthesizer(
            multi_metadata.tables[df_name],
            enforce_min_max_values=True,
            enforce_rounding=True,
            default_distribution='norm',
        )
        synthesizer.fit(df)
        synthesizer.save(
            filepath='models_single/'+df_name+'_gc.pkl'
        )

In [13]:
%%time

# Fit and save one GaussianCopula model per real table.
collection_gaussiancopula_training(real_data_collection, sdv_metadata)

CPU times: total: 5.34 s
Wall time: 5.39 s


In [12]:
%%time

# Reload each saved GaussianCopula model and draw as many synthetic rows as
# the corresponding real table has.
gc_synthetic_data_collection = {}
for df_name, content in generation_dict.items():
    model_path = 'models_single/'+df_name+'_gc.pkl'
    synthesizer = GaussianCopulaSynthesizer.load(filepath=model_path)
    sampled_table = synthesizer.sample(num_rows=content['nrows'])
    gc_synthetic_data_collection[df_name] = sampled_table

CPU times: total: 14.4 s
Wall time: 9.8 s


In [13]:
# Per-table cosine score of the GaussianCopula samples against the real tables.
batch_cos_test(gc_synthetic_data_collection, real_data_collection)

{'agency': '96.0% Similar',
 'calendar': '47.2% Similar',
 'calendar_dates': '9.8% Similar',
 'routes': '70.7% Similar',
 'stops': '82.7% Similar',
 'stop_times': '54.6% Similar',
 'trips': '54.7% Similar'}

In [23]:
# Per-table KS score (1 - mean KS statistic) of the GaussianCopula samples.
batch_ks_test(gc_synthetic_data_collection, real_data_collection)

  ks_stat, ks_p_value = ks_2samp(df1[column], df2[column])


{'agency': 1.0,
 'calendar': 0.9049586776859504,
 'calendar_dates': 0.6666666666666667,
 'routes': 0.6093023255813954,
 'stops': 0.7977029755403303,
 'stop_times': 0.7459531025351938,
 'trips': 0.4921226429651575}

In [15]:
# Build an SDV single-table quality report for every GaussianCopula table.
# NOTE: the CTGAN and TVAE sections rebind ``individual_report_collection``,
# so these reports are replaced once those cells run.
individual_report_collection = {}
for df_name, real_table in real_data_collection.items():
    print(f'[{df_name}]:')
    quality_report = st_evaluate_quality(
        real_data=real_table,
        synthetic_data=gc_synthetic_data_collection[df_name],
        metadata=sdv_metadata.tables[df_name],
    )
    individual_report_collection[df_name] = quality_report
    print()
    print('--------------------')
    print()

[agency]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 60.59it/s]



Overall Quality Score: 100.0%

Properties:
Column Shapes: 100.0%
Column Pair Trends: 100.0%

--------------------

[calendar]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 19.80it/s]



Overall Quality Score: 82.03%

Properties:
Column Shapes: 89.44%
Column Pair Trends: 74.62%

--------------------

[calendar_dates]:


Creating report: 100%|█████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2000.86it/s]



Overall Quality Score: 78.75%

Properties:
Column Shapes: 91.02%
Column Pair Trends: 66.47%

--------------------

[routes]:


Creating report: 100%|██████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 102.54it/s]



Overall Quality Score: 65.74%

Properties:
Column Shapes: 74.57%
Column Pair Trends: 56.9%

--------------------

[stops]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 15.68it/s]



Overall Quality Score: 72.68%

Properties:
Column Shapes: 76.93%
Column Pair Trends: 68.42%

--------------------

[stop_times]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:13<00:00,  3.47s/it]



Overall Quality Score: 82.07%

Properties:
Column Shapes: 87.16%
Column Pair Trends: 76.98%

--------------------

[trips]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.83it/s]


Overall Quality Score: 76.13%

Properties:
Column Shapes: 84.09%
Column Pair Trends: 68.17%

--------------------






## CTGAN

In [16]:
# Per-table CTGAN training settings; every table currently uses 10 epochs.
training_parameter_dict = dict(
    agency=dict(epochs=10),
    calendar=dict(epochs=10),
    calendar_dates=dict(epochs=10),
    routes=dict(epochs=10),
    stops=dict(epochs=10),
    stop_times=dict(epochs=10),
    trips=dict(epochs=10),
)

In [17]:
def collection_gtgan_training(data_collection, training_parameter_dict, multi_metadata):
    """Fit one CTGANSynthesizer per table and save each fitted model.

    Parameters
    ----------
    data_collection : dict
        Maps table name to the table to fit on.
    training_parameter_dict : dict
        Maps table name to a dict holding at least an ``'epochs'`` entry.
    multi_metadata
        Object exposing ``.tables[table_name]`` with each table's metadata.

    Returns
    -------
    None
        Models are written to ``models_single/<table_name>_ctgan.pkl``.
        The table name is printed before each fit, and the synthesizer is
        created with ``verbose=True``.
    """
    # Create the output folder up front so a missing directory cannot make
    # ``save`` fail after the fit has already been paid for.
    os.makedirs('models_single', exist_ok=True)
    for df_name, df in data_collection.items():
        print(f'[{df_name}]:')
        synthesizer = CTGANSynthesizer(
            multi_metadata.tables[df_name],
            enforce_rounding=True,
            epochs=training_parameter_dict[df_name]['epochs'],
            verbose=True,
            batch_size=400
        )
        synthesizer.fit(df)
        synthesizer.save(
            filepath='models_single/'+df_name+'_ctgan.pkl'
        )
        print()

In [18]:
%%time

# Fit and save one CTGAN model per real table, using the per-table epochs above.
collection_gtgan_training(real_data_collection, training_parameter_dict, sdv_metadata)

[agency]:
Epoch 1, Loss G:  0.0865,Loss D:  0.0002
Epoch 2, Loss G:  0.0952,Loss D:  0.0106
Epoch 3, Loss G:  0.1292,Loss D:  0.0175
Epoch 4, Loss G:  0.1473,Loss D:  0.0002
Epoch 5, Loss G:  0.1777,Loss D:  0.0333
Epoch 6, Loss G:  0.1878,Loss D:  0.0109
Epoch 7, Loss G:  0.2007,Loss D:  0.0521
Epoch 8, Loss G:  0.1938,Loss D: -0.0571
Epoch 9, Loss G:  0.2226,Loss D: -0.0216
Epoch 10, Loss G:  0.2551,Loss D: -0.0173

[calendar]:
Epoch 1, Loss G:  0.7788,Loss D: -0.0198
Epoch 2, Loss G:  0.7605,Loss D: -0.0103
Epoch 3, Loss G:  0.7360,Loss D: -0.0476
Epoch 4, Loss G:  0.7320,Loss D: -0.0328
Epoch 5, Loss G:  0.7522,Loss D: -0.0274
Epoch 6, Loss G:  0.7630,Loss D: -0.0403
Epoch 7, Loss G:  0.7464,Loss D:  0.0097
Epoch 8, Loss G:  0.7251,Loss D: -0.0085
Epoch 9, Loss G:  0.7650,Loss D: -0.0063
Epoch 10, Loss G:  0.7524,Loss D:  0.0066

[calendar_dates]:
Epoch 1, Loss G:  0.8342,Loss D: -0.0011
Epoch 2, Loss G:  0.8157,Loss D:  0.0096
Epoch 3, Loss G:  0.8172,Loss D:  0.0283
Epoch 4, Loss

In [15]:
%%time

# Reload each saved CTGAN model and draw as many synthetic rows as the
# corresponding real table has.
ctgan_synthetic_data_collection = {}
for df_name, content in generation_dict.items():
    # Load through CTGANSynthesizer: these files were saved by CTGAN models,
    # so the loader class should match the model type (as the TVAE section does).
    synthesizer = CTGANSynthesizer.load(
        filepath='models_single/'+df_name+'_ctgan.pkl'
    )
    ctgan_synthetic_data_collection[df_name] = synthesizer.sample(
        num_rows=content['nrows']
    )

CPU times: total: 2min 56s
Wall time: 50 s


In [16]:
# Per-table cosine score of the CTGAN samples against the real tables.
batch_cos_test(ctgan_synthetic_data_collection, real_data_collection)

{'agency': '96.0% Similar',
 'calendar': '43.8% Similar',
 'calendar_dates': '9.8% Similar',
 'routes': '64.5% Similar',
 'stops': '83.6% Similar',
 'stop_times': '52.7% Similar',
 'trips': '54.8% Similar'}

In [17]:
# Per-table KS score (1 - mean KS statistic) of the CTGAN samples.
batch_ks_test(ctgan_synthetic_data_collection, real_data_collection)

  ks_stat, ks_p_value = ks_2samp(df1[column], df2[column])


{'agency': '100.0% similar in distribution',
 'calendar': '84.4% similar in distribution',
 'calendar_dates': '63.1% similar in distribution',
 'routes': '50.5% similar in distribution',
 'stops': '86.3% similar in distribution',
 'stop_times': '72.0% similar in distribution',
 'trips': '50.1% similar in distribution'}

In [20]:
# Build an SDV single-table quality report for every CTGAN table.
# NOTE: this rebinds ``individual_report_collection``, replacing any reports
# stored under that name by an earlier section.
individual_report_collection = {}
for df_name, real_table in real_data_collection.items():
    print(f'[{df_name}]:')
    quality_report = st_evaluate_quality(
        real_data=real_table,
        synthetic_data=ctgan_synthetic_data_collection[df_name],
        metadata=sdv_metadata.tables[df_name],
    )
    individual_report_collection[df_name] = quality_report
    print('--------------------')
    print()

[agency]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 61.53it/s]



Overall Quality Score: 100.0%

Properties:
Column Shapes: 100.0%
Column Pair Trends: 100.0%
--------------------

[calendar]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 18.18it/s]



Overall Quality Score: 73.98%

Properties:
Column Shapes: 82.37%
Column Pair Trends: 65.58%
--------------------

[calendar_dates]:


Creating report: 100%|█████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 4002.20it/s]



Overall Quality Score: 66.39%

Properties:
Column Shapes: 85.31%
Column Pair Trends: 47.48%
--------------------

[routes]:


Creating report: 100%|██████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 124.97it/s]



Overall Quality Score: 36.51%

Properties:
Column Shapes: 47.75%
Column Pair Trends: 25.27%
--------------------

[stops]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.83it/s]



Overall Quality Score: 80.9%

Properties:
Column Shapes: 84.48%
Column Pair Trends: 77.33%
--------------------

[stop_times]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:13<00:00,  3.47s/it]



Overall Quality Score: 81.33%

Properties:
Column Shapes: 86.02%
Column Pair Trends: 76.65%
--------------------

[trips]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 16.19it/s]


Overall Quality Score: 88.24%

Properties:
Column Shapes: 92.16%
Column Pair Trends: 84.32%
--------------------






## TVAE

In [21]:
# Per-table TVAE training settings; every table currently uses 10 epochs.
# This rebinds the ``training_parameter_dict`` name used in the CTGAN section.
training_parameter_dict = dict(
    agency=dict(epochs=10),
    calendar=dict(epochs=10),
    calendar_dates=dict(epochs=10),
    routes=dict(epochs=10),
    stops=dict(epochs=10),
    stop_times=dict(epochs=10),
    trips=dict(epochs=10),
)

In [22]:
def collection_tvae_training(data_collection, training_parameter_dict, multi_metadata):
    """Fit one TVAESynthesizer per table and save each fitted model.

    Parameters
    ----------
    data_collection : dict
        Maps table name to the table to fit on.
    training_parameter_dict : dict
        Maps table name to a dict holding at least an ``'epochs'`` entry.
    multi_metadata
        Object exposing ``.tables[table_name]`` with each table's metadata.

    Returns
    -------
    None
        Models are written to ``models_single/<table_name>_tvae.pkl``.
    """
    # Create the output folder up front so a missing directory cannot make
    # ``save`` fail after the fit has already been paid for.
    os.makedirs('models_single', exist_ok=True)
    for df_name, df in data_collection.items():
        synthesizer = TVAESynthesizer(
            multi_metadata.tables[df_name],
            enforce_min_max_values=True,
            enforce_rounding=True,
            epochs=training_parameter_dict[df_name]['epochs'],
            batch_size=400
        )
        synthesizer.fit(df)
        synthesizer.save(
            filepath='models_single/'+df_name+'_tvae.pkl'
        )

In [23]:
%%time

# Fit and save one TVAE model per real table, using the per-table epochs above.
collection_tvae_training(real_data_collection, training_parameter_dict, sdv_metadata)

CPU times: total: 26min 21s
Wall time: 7min 20s


In [18]:
%%time

# Reload each saved TVAE model and draw as many synthetic rows as the
# corresponding real table has.
tvae_synthetic_data_collection = {}
for df_name, content in generation_dict.items():
    model_path = 'models_single/'+df_name+'_tvae.pkl'
    synthesizer = TVAESynthesizer.load(filepath=model_path)
    sampled_table = synthesizer.sample(num_rows=content['nrows'])
    tvae_synthetic_data_collection[df_name] = sampled_table

CPU times: total: 1min 2s
Wall time: 31.9 s


In [19]:
# Per-table cosine score of the TVAE samples against the real tables.
batch_cos_test(tvae_synthetic_data_collection, real_data_collection)

{'agency': '96.0% Similar',
 'calendar': '69.6% Similar',
 'calendar_dates': '8.5% Similar',
 'routes': '83.4% Similar',
 'stops': '96.3% Similar',
 'stop_times': '59.7% Similar',
 'trips': '54.9% Similar'}

In [20]:
# Per-table KS score (1 - mean KS statistic) of the TVAE samples.
batch_ks_test(tvae_synthetic_data_collection, real_data_collection)

{'agency': '100.0% similar in distribution',
 'calendar': '73.8% similar in distribution',
 'calendar_dates': '49.2% similar in distribution',
 'routes': '67.5% similar in distribution',
 'stops': '90.3% similar in distribution',
 'stop_times': '75.8% similar in distribution',
 'trips': '48.5% similar in distribution'}

In [25]:
# Build an SDV single-table quality report for every TVAE table.
# NOTE: this rebinds ``individual_report_collection``, replacing any reports
# stored under that name by an earlier section.
individual_report_collection = {}
for df_name, real_table in real_data_collection.items():
    print(f'[{df_name}]:')
    quality_report = st_evaluate_quality(
        real_data=real_table,
        synthetic_data=tvae_synthetic_data_collection[df_name],
        metadata=sdv_metadata.tables[df_name],
    )
    individual_report_collection[df_name] = quality_report
    print()
    print('--------------------')
    print()

[agency]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 72.71it/s]



Overall Quality Score: 100.0%

Properties:
Column Shapes: 100.0%
Column Pair Trends: 100.0%

--------------------

[calendar]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.10it/s]



Overall Quality Score: 63.59%

Properties:
Column Shapes: 70.89%
Column Pair Trends: 56.29%

--------------------

[calendar_dates]:


Creating report: 100%|█████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1999.67it/s]



Overall Quality Score: 55.86%

Properties:
Column Shapes: 64.84%
Column Pair Trends: 46.88%

--------------------

[routes]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 93.00it/s]



Overall Quality Score: 87.52%

Properties:
Column Shapes: 88.68%
Column Pair Trends: 86.36%

--------------------

[stops]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 13.79it/s]



Overall Quality Score: 86.75%

Properties:
Column Shapes: 89.04%
Column Pair Trends: 84.47%

--------------------

[stop_times]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:13<00:00,  3.43s/it]



Overall Quality Score: 86.25%

Properties:
Column Shapes: 89.96%
Column Pair Trends: 82.54%

--------------------

[trips]:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.66it/s]


Overall Quality Score: 60.13%

Properties:
Column Shapes: 73.42%
Column Pair Trends: 46.84%

--------------------






## HMA

In [26]:
# Multi-table HMA synthesizer built from the full multi-table metadata.
synthesizer = HMASynthesizer(sdv_metadata)

In [None]:
%%time

# Fit the HMA synthesizer on all real tables at once.
# NOTE(review): this cell has no execution count or output in the saved
# notebook -- presumably it was never run to completion; confirm before
# relying on an HMA model.
synthesizer.fit(real_data_collection)

# KS Comparison Table

In [30]:
def create_comparison_table(real_collection, fake_collections):
    """Tabulate ``batch_ks_test`` scores for several synthetic collections.

    Parameters
    ----------
    real_collection : dict
        Maps table name to the real table.
    fake_collections : dict
        Maps a model label to that model's synthetic collection.

    Returns
    -------
    pandas.DataFrame
        One column per model label, one row per table name.
    """
    scores_by_model = {
        model_name: batch_ks_test(real_collection, fake_collection)
        for model_name, fake_collection in fake_collections.items()
    }
    return pd.DataFrame(scores_by_model)

In [31]:
# Synthetic collections to compare, keyed by the column label shown in the table.
benchmark_collection = {
    'GaussianCopula': gc_synthetic_data_collection,
    'CTGAN': ctgan_synthetic_data_collection,
    'TVAE': tvae_synthetic_data_collection,
    'New Approach': synthetic_data_collection,
}

# One row per table, one column per model; values are 1 - mean KS statistic.
ks_comparison_table = create_comparison_table(real_data_collection, benchmark_collection)
ks_comparison_table.round(3)

  ks_stat, ks_p_value = ks_2samp(df1[column], df2[column])


Unnamed: 0,GaussianCopula,CTGAN,TVAE,New Approach
agency,1.0,1.0,1.0,1.0
calendar,0.905,0.844,0.738,0.569
calendar_dates,0.667,0.631,0.492,0.691
routes,0.609,0.505,0.675,0.558
stops,0.798,0.863,0.903,0.524
stop_times,0.746,0.72,0.758,0.684
trips,0.492,0.501,0.485,0.716
