In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import networkx as nx

from sklearn.preprocessing import OrdinalEncoder

from datgan import stats_assessment
from datgan import ml_assessment, transform_results

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Load the original and synthetic data

In [2]:
# Original CMAP dataset; every synthetic dataset below is assessed against it.
# index_col=False keeps pandas from using the first CSV column as the index.
df_orig = pd.read_csv('./data/CMAP.csv', index_col=False)

In [4]:
# Columns treated as continuous; every other column of df_orig is handled as
# categorical by the cells further down.
continuous_columns = ["distance", "age", "departure_time"]

We selected synthetic datasets generated from the CMAP datasets using different state-of-the-art models to be tested against each other.

In [5]:
# One CSV per generative model; the list order is the order in which the
# datasets are processed by the assessment loops.
synth_files = [
    'data/{}.csv'.format(model)
    for model in ('TGAN', 'CTGAN', 'CTABGAN', 'TVAE', 'DATGAN', 'DATGAN_ci')
]

In [6]:
# Directory in which the intermediate results (pickle files) are stored
results_path = './results/'

# exist_ok=True creates the directory only when it is missing, without the
# separate "exists?" check (which is racy and needs two filesystem calls).
os.makedirs(results_path, exist_ok=True)

In [7]:
# Keys of the statistics that are read from the stats_assessment results when
# building the human-readable summaries below.
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']

# Statistical assessment

The statistical assessment can be done on three aggregation levels. We encourage you to save results regularly since it can take quite some time to compute the statistical assessments, especially on the third level of aggregation.

## First level

In [8]:
pickle_name = 'stats_first_level.pickle'
aggregation_level = 1

# Start from an empty cache; it is replaced below when a previous run left a
# results file behind, so datasets that were already assessed are skipped.
first_lvl_stats = {}

try:
    # NOTE: unpickling can execute arbitrary code -- only load result files
    # that were written by this notebook.
    with open(results_path + pickle_name, 'rb') as pickle_file:
        first_lvl_stats = pickle.load(pickle_file)
    print('Found previous pickel file, using that')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated cache file: recompute everything.
    # Other errors (and KeyboardInterrupt) are deliberately not swallowed.
    print('No previous results found, starting fresh')

No previous results found, starting fresh


In [9]:
for i, f in enumerate(synth_files):

    # Dataset key: file name without its directory and extension (e.g. 'TGAN')
    file_name = f.split('/')[-1].split('.')[0]

    if file_name in first_lvl_stats:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        df_synth = pd.read_csv(f, index_col=False)

        stats = stats_assessment(df_orig, df_synth, continuous_columns, aggregation_level)

        # Register the entry only once the assessment has finished. Storing a
        # placeholder beforehand would make an interrupted run look like an
        # "already existing" (but empty) result when the cell is re-run.
        first_lvl_stats[file_name] = stats

    # Save after every dataset so a long run can be resumed; the context
    # manager guarantees the file is flushed and closed.
    with open(results_path + pickle_name, 'wb') as pickle_file:
        pickle.dump(first_lvl_stats, pickle_file)

print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1mTGAN[0m (1/6)
Preparing stats for file [1mCTGAN[0m (2/6)
Preparing stats for file [1mCTABGAN[0m (3/6)
Preparing stats for file [1mTVAE[0m (4/6)
Preparing stats for file [1mDATGAN[0m (5/6)
Preparing stats for file [1mDATGAN_ci[0m (6/6)
[1mFINISHED![0m


### Human-readable results

This is an example of how to obtain more human-readable results. With the first level of aggregation, we can compare the results on continuous and categorical columns.

In [10]:
# res[group][statistic][model] -> list with one value per column of the group
res = {}

for test in ['all', 'cont', 'cat']:

    # Columns belonging to the current group
    if test == 'cont':
        cols = continuous_columns
    elif test == 'cat':
        cols = set(df_orig.columns) - set(continuous_columns)
    else:
        cols = df_orig.columns

    # Gather, per statistic and per model, the values of the selected columns
    res[test] = {
        s: {
            m: [first_lvl_stats[m][c][s] for c in cols]
            for m in first_lvl_stats.keys()
        }
        for s in stats_str
    }

In [11]:
# avg[group][statistic][model] -> mean and standard deviation over the columns
avg = {}

for test in ('all', 'cont', 'cat'):
    avg[test] = {
        s: {
            m: {
                'mean': np.mean(res[test][s][m]),
                'std': np.std(res[test][s][m])
            }
            for m in first_lvl_stats.keys()
        }
        for s in stats_str
    }

In [12]:
for test, str_ in [('all', 'on all columns'),
                   ('cont', 'on continuous columns'),
                   ('cat', 'on categorical columns')]:

    # Only SRMSE is ranked here; iterate over stats_str to rank every statistic
    for s in ['srmse']:
        print('Ranking {} based on {}:'.format(str_, s.upper()))

        # Sort the models by increasing mean; for r2 and corr the order is
        # flipped so that the largest mean comes first.
        ranking = sorted(avg[test][s].items(), key=lambda item: item[1]['mean'])
        if s in ['r2', 'corr']:
            ranking.reverse()

        for rank, (model, summary) in enumerate(ranking, start=1):
            print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(rank, model, summary['mean'], summary['std']))
        print()


Ranking on all columns based on SRMSE:
   1. DATGAN_ci       - 5.42e-02 ± 5.56e-02
   2. DATGAN          - 6.02e-02 ± 5.58e-02
   3. TGAN            - 1.34e-01 ± 1.40e-01
   4. CTABGAN         - 2.13e-01 ± 1.38e-01
   5. TVAE            - 2.32e-01 ± 1.64e-01
   6. CTGAN           - 2.34e-01 ± 1.08e-01

Ranking on continuous columns based on SRMSE:
   1. CTABGAN         - 1.20e-01 ± 6.96e-02
   2. DATGAN_ci       - 1.25e-01 ± 7.63e-02
   3. DATGAN          - 1.35e-01 ± 8.49e-02
   4. TVAE            - 1.61e-01 ± 3.52e-02
   5. CTGAN           - 1.87e-01 ± 1.01e-01
   6. TGAN            - 4.04e-01 ± 2.12e-02

Ranking on categorical columns based on SRMSE:
   1. DATGAN_ci       - 3.64e-02 ± 2.87e-02
   2. DATGAN          - 4.13e-02 ± 1.77e-02
   3. TGAN            - 6.61e-02 ± 3.94e-02
   4. CTABGAN         - 2.36e-01 ± 1.41e-01
   5. CTGAN           - 2.45e-01 ± 1.07e-01
   6. TVAE            - 2.49e-01 ± 1.78e-01



## Second and third level

In [13]:
pickle_name = 'stats_second_level.pickle'
aggregation_level = 2

# Start from an empty cache; it is replaced below when a previous run left a
# results file behind, so datasets that were already assessed are skipped.
second_lvl_stats = {}

try:
    # NOTE: unpickling can execute arbitrary code -- only load result files
    # that were written by this notebook.
    with open(results_path + pickle_name, 'rb') as pickle_file:
        second_lvl_stats = pickle.load(pickle_file)
    print('Found previous pickel file, using that')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated cache file: recompute everything.
    # Other errors (and KeyboardInterrupt) are deliberately not swallowed.
    print('No previous results found, starting fresh')

No previous results found, starting fresh


In [14]:
for i, f in enumerate(synth_files):

    # Dataset key: file name without its directory and extension (e.g. 'TGAN')
    file_name = f.split('/')[-1].split('.')[0]

    if file_name in second_lvl_stats:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        df_synth = pd.read_csv(f, index_col=False)

        stats = stats_assessment(df_orig, df_synth, continuous_columns, aggregation_level)

        # Register the entry only once the assessment has finished. Storing a
        # placeholder beforehand would make an interrupted run look like an
        # "already existing" (but empty) result when the cell is re-run.
        second_lvl_stats[file_name] = stats

    # Save after every dataset so a long run can be resumed; the context
    # manager guarantees the file is flushed and closed.
    with open(results_path + pickle_name, 'wb') as pickle_file:
        pickle.dump(second_lvl_stats, pickle_file)

print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1mTGAN[0m (1/6)
Preparing stats for file [1mCTGAN[0m (2/6)
Preparing stats for file [1mCTABGAN[0m (3/6)
Preparing stats for file [1mTVAE[0m (4/6)
Preparing stats for file [1mDATGAN[0m (5/6)
Preparing stats for file [1mDATGAN_ci[0m (6/6)
[1mFINISHED![0m


### Human-readable results

As for the first aggregation level, we can make the results more human-readable.

In [15]:
# res[statistic][model] -> list with one value per entry of the model's
# second-level results
res = {
    s: {
        m: [second_lvl_stats[m][c][s] for c in second_lvl_stats[m].keys()]
        for m in second_lvl_stats.keys()
    }
    for s in stats_str
}

In [16]:
# avg[statistic][model] -> mean and standard deviation of the second-level values
avg = {}

for s in stats_str:
    avg[s] = {}

    # Iterate over the models of the *second*-level results. Using the keys of
    # first_lvl_stats only worked as long as both levels happened to contain
    # exactly the same models (and required the first-level cells to be run).
    for m in second_lvl_stats.keys():
        avg[s][m] = {
            'mean': np.mean(res[s][m]),
            'std': np.std(res[s][m])
        }

In [17]:
# Only SRMSE is ranked here; iterate over stats_str to rank every statistic
for s in ['srmse']:
    print('Ranking based on {} for aggregation level {}:'.format(s.upper(), aggregation_level))

    # Sort the models by increasing mean; for r2 and corr the order is
    # flipped so that the largest mean comes first.
    ranking = sorted(avg[s].items(), key=lambda item: item[1]['mean'])
    if s in ['r2', 'corr']:
        ranking.reverse()

    for rank, (model, summary) in enumerate(ranking, start=1):
        print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(rank, model, summary['mean'], summary['std']))
    print()


Ranking based on SRMSE for aggregation level 2:
   1. DATGAN_ci       - 1.63e-01 ± 1.05e-01
   2. DATGAN          - 1.66e-01 ± 1.06e-01
   3. TGAN            - 3.40e-01 ± 2.45e-01
   4. CTABGAN         - 4.88e-01 ± 2.14e-01
   5. CTGAN           - 5.09e-01 ± 1.69e-01
   6. TVAE            - 5.62e-01 ± 2.88e-01



We can do the same thing for the third level of aggregation. We just don't show it here. 

# Machine Learning efficacy

For this test, you first need to check whether some values appear with a low probability. If that is the case, we recommend replacing these values.

In [18]:
def check_low_appearing_vars(df, max_categories=20, min_freq=0.01, ignore=('choice',)):
    """
    Print the rarely observed values of every low-cardinality column.

    A column is inspected when it has fewer than `max_categories` distinct
    values and is not listed in `ignore`. For such a column, every value whose
    relative frequency is below `min_freq` is printed together with its
    percentage and its number of occurrences.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    max_categories : int, default 20
        Columns with this many (or more) distinct values are skipped.
    min_freq : float, default 0.01
        Relative frequency below which a value is reported.
    ignore : iterable of str, default ('choice',)
        Column names that are never reported.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    n_rows = len(df)

    for c in df.columns:
        if c in ignore:
            continue

        counts = df[c].value_counts()
        if len(counts) >= max_categories:
            continue

        freqs = counts / n_rows
        if not (freqs < min_freq).any():
            continue

        print('Variable {}: '.format(c))
        for idx, freq, count in zip(counts.index, freqs, counts):
            if freq < min_freq:
                # Print the exact count: rebuilding it as int(freq * n_rows)
                # can be off by one because of floating-point truncation.
                print('  {} - {:.2f}% ({:d})'.format(idx, 100*freq, int(count)))
        print()
                
def replace_low_appearing_values(df, thresholds=None):
    """
    Merge the rare, large values of the household count columns into one
    open-ended category, in place.

    For every column in `thresholds`, values greater than or equal to the
    column's threshold `t` become the string '<t>+' (e.g. '5+'); every other
    value becomes its string representation. Values that are already strings
    are left untouched, so calling the function twice on the same DataFrame is
    harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to modify. Must contain every column listed in `thresholds`
        (a KeyError is raised otherwise).
    thresholds : dict, optional
        Maps a column name to the value from which on the values are merged.
        Defaults to {'hh_vehicles': 5, 'hh_size': 6, 'hh_bikes': 6}.

    Returns
    -------
    None
        `df` is modified in place.
    """
    if thresholds is None:
        thresholds = {'hh_vehicles': 5, 'hh_size': 6, 'hh_bikes': 6}

    for column, cap in thresholds.items():
        top_label = '{}+'.format(cap)

        def _bucket(value, cap=cap, top_label=top_label):
            # Already converted by a previous call (or read as text): keep it
            if isinstance(value, str):
                return value
            return top_label if value >= cap else str(value)

        # Assign the new column back to the frame. The chained
        # df[column].replace(..., inplace=True) form acts on a temporary
        # Series and is not guaranteed to update `df` under pandas'
        # Copy-on-Write behaviour.
        df[column] = df[column].map(_bucket)

In [19]:
# Report the rarely observed values of the original dataset
check_low_appearing_vars(df_orig)

Variable hh_vehicles: 
  5 - 0.67% (60)
  6 - 0.26% (23)
  7 - 0.08% (7)
  8 - 0.06% (5)

Variable hh_size: 
  7 - 0.36% (32)
  8 - 0.13% (12)

Variable hh_bikes: 
  6 - 0.77% (69)
  7 - 0.27% (24)



In [20]:
# Merge the rare household values. This modifies df_orig itself, so the cells
# above this point see different data if they are re-run afterwards.
replace_low_appearing_values(df_orig)

In [21]:
# Run the check again: no output means no rarely observed value is left
check_low_appearing_vars(df_orig)

We removed the low appearing values => we can continue!

In [22]:
# Define the categorical columns: every column that is not continuous.
# The columns are kept in DataFrame order. A set difference would return them
# in an arbitrary order that can change from one kernel session to the next,
# which makes the encoder input and the assessment harder to reproduce.
categorical_columns = [c for c in df_orig.columns if c not in continuous_columns]

We need to encode the categorical columns

In [23]:
# Fit the encoder on the original data; the fitted `enc` is reused further
# down to transform every synthetic dataset with the same encoding.
# NOTE(review): df_orig is overwritten in place, so re-running this cell fits
# a new encoder on the already encoded values.
enc = OrdinalEncoder()
df_orig[categorical_columns] = enc.fit_transform(df_orig[categorical_columns])

As for the statistical tests, we recommend saving the results after each synthetic dataset.

In [24]:
pickle_name = 'ml_efficacy.pickle'

# Start from an empty cache; it is replaced below when a previous run left a
# results file behind, so datasets that were already assessed are skipped.
cv_modelscores = {}

try:
    # NOTE: unpickling can execute arbitrary code -- only load result files
    # that were written by this notebook.
    with open(results_path + pickle_name, 'rb') as pickle_file:
        cv_modelscores = pickle.load(pickle_file)
    print('Found previous pickel file, using that')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated cache file: recompute everything.
    # Other errors (and KeyboardInterrupt) are deliberately not swallowed.
    print('No previous results found, starting fresh')
No previous results found, starting fresh


In [25]:
for i, f in enumerate(synth_files):

    # Dataset key: file name without its directory and extension (e.g. 'TGAN')
    file_name = f.split('/')[-1].split('.')[0]

    if file_name in cv_modelscores:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        # Load the synthetic dataset
        df_synth = pd.read_csv(f, index_col=False)

        # Replace the values rarely appearing
        replace_low_appearing_values(df_synth)

        # Encode the synthetic dataset with the encoder fitted on the original data
        df_synth[categorical_columns] = enc.transform(df_synth[categorical_columns])

        res = ml_assessment(df_orig, df_synth, continuous_columns, categorical_columns)

        # Register the entry only once the assessment has finished. Storing a
        # placeholder beforehand would make an interrupted run look like an
        # "already existing" (but empty) result when the cell is re-run.
        cv_modelscores[file_name] = res

    # Save after every dataset so a long run can be resumed; the context
    # manager guarantees the file is flushed and closed.
    with open(results_path + pickle_name, 'wb') as pickle_file:
        pickle.dump(cv_modelscores, pickle_file)

print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1mTGAN[0m (1/6)
Preparing stats for file [1mCTGAN[0m (2/6)
Preparing stats for file [1mCTABGAN[0m (3/6)
Preparing stats for file [1mTVAE[0m (4/6)
Preparing stats for file [1mDATGAN[0m (5/6)
Preparing stats for file [1mDATGAN_ci[0m (6/6)
[1mFINISHED![0m                  


Raw results are a bit difficult to assess. Therefore, we provide a way to get an ordered list of the synthetic datasets tested. However, for this, you first need to run the ML assessment on the original dataset and store its results under the key `'original'`!

In [26]:
# Reference scores: the original dataset is assessed against itself and stored
# under the key 'original'.
if 'original' in cv_modelscores:
    print("Results for file \033[1m{}\033[0m already exists!".format('original'))
else:
    print("Preparing stats for file \033[1m{}\033[0m".format('original'))

    res = ml_assessment(df_orig, df_orig, continuous_columns, categorical_columns)
    cv_modelscores['original'] = res

    # The context manager guarantees the file is flushed and closed
    with open(results_path + pickle_name, 'wb') as pickle_file:
        pickle.dump(cv_modelscores, pickle_file)
    print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1moriginal[0m
[1mFINISHED![0m                  


In [27]:
# Turn the raw scores into two rankings, one for the continuous and one for
# the categorical columns; each entry is read below as (name, score).
cont_sorted, cat_sorted = transform_results(cv_modelscores, continuous_columns, categorical_columns)

In [28]:
# Print both rankings side by side, one row per rank
print('   | {:<30} | {:<30}'.format('categorical', 'continuous'))
print('-----------------------------------------------------------')
# enumerate(..., start=1) supplies the rank instead of a hand-maintained counter
for i, (a, b) in enumerate(zip(cat_sorted, cont_sorted), start=1):
    print('{:>2} | {:<30} | {:<30}'.format(i, '{:<12}: {:.3f}'.format(a[0], a[1]), '{:<12}: {:.3f}'.format(b[0], b[1])))

   | categorical                    | continuous                    
-----------------------------------------------------------
 1 | original    : -2.213           | original    : 2.573           
 2 | DATGAN_ci   : 0.782            | DATGAN_ci   : 3.093           
 3 | DATGAN      : 0.790            | DATGAN      : 3.108           
 4 | TGAN        : 1.022            | CTABGAN     : 3.242           
 5 | CTGAN       : 1.486            | TGAN        : 3.264           
 6 | CTABGAN     : 1.539            | TVAE        : 3.286           
 7 | TVAE        : 4.427            | CTGAN       : 3.332           
