# Notebook for documenting how the synthetic data can be generated

In this notebook we show our process for generating the synthetic data used for the model benchmark. The data is generated using the SynthCity library for CTGAN, DataSynthesizer, and the synthpop CART model accessed in R.

This part is included for reproducibility, but it is not required for running the benchmark if you use the existing datasets in the repository. Since generating everything can be a long and slow process, we recommend running the same code in the `generate_datasets.py` script on a more powerful machine.

In [1]:
import glob, os

import numpy as np
import pandas as pd

from tqdm import notebook
from itertools import product
from joblib import Parallel, delayed

from syntheval import SynthEval
from utils.utils import prepare_directories
from utils.gen_interfacer import generate_synthetic_data

# Seed NumPy's global random generator with a fixed value; remove this line
# to stop pinning the seed.
# NOTE(review): the generation below runs through joblib.Parallel, whose
# workers may be separate processes that do not share this seeded state -
# confirm whether the seed actually reaches the generators.
np.random.seed(0)

### Parameters

# Number of candidate synthetic datasets generated per (dataset, model) pair;
# passed to work_func as num_reps.
NUM_REPEATS = 3

# Maps each dataset path prefix to the variable handed to SynthEval's
# benchmark as the target. work_func appends '_train.csv', '_test.csv' and
# '_<model>.csv' to these prefixes to build file names.
#
# All keys use forward slashes: they work on Windows as well as POSIX systems,
# whereas backslash-separated literals only resolve on Windows and contain
# invalid escape sequences ('\s', '\c', '\d', ...) that newer Python versions
# warn about.
target_vars = {
    'data/small_few_atts/diabetes': 'Outcome',
    'data/small_few_atts/penguins': 'species',
    'data/small_few_atts/titanic': 'Survived',
    'data/small_some_atts/cervical_cancer': 'Biopsy',
    'data/small_some_atts/derm': 'class',
    'data/small_some_atts/spect': 'OVERALL_DIAGNOSIS',
    'data/small_many_atts/diabetic_mellitus': 'TYPE',
    'data/small_many_atts/mice_protein': 'class',
    'data/small_many_atts/spectrometer': 'ID-type',
    'data/large_few_atts/space_titanic': 'Survived',
    'data/large_few_atts/stroke': 'stroke',
    'data/large_few_atts/winequality': 'quality',
    'data/large_some_atts/cardiotocography': 'Class',
    'data/large_some_atts/one_hundred_plants': 'Class',
    'data/large_some_atts/steel_faults': 'class',
    'data/large_many_atts/bankruptcy': 'Bankrupt',
    'data/large_many_atts/speed_dating': 'match',
    'data/large_many_atts/yeast_ml8': 'class1',
}

# Model identifiers passed to generate_synthetic_data; also used as the
# suffix of the saved synthetic dataset file names.
generative_model_names = ['ctgan', 'datasynthesizer', 'synthpop']

# Generate a synthetic dataset when its saved CSV cannot be loaded.
GENERATE_MISSING = True
# Regenerate even when a saved CSV already exists (only takes effect while
# GENERATE_MISSING is also True).
GENERATE_OVERWRITE = False

### Code to generate synthetic datasets
def work_func(iterable, num_reps=3):
    """Make sure the synthetic dataset for one (dataset, model) pair is on disk.

    If '<dataset_name>_<generative_model>.csv' can already be loaded and
    GENERATE_OVERWRITE is False, nothing is done. Otherwise, provided
    GENERATE_MISSING is True, ``num_reps`` candidates are generated, benchmarked
    with SynthEval against the train/test splits, and the candidate with the
    highest 'rank' is written to that CSV.

    Args:
        iterable: ``(dataset_name, generative_model)`` pair, where
            ``dataset_name`` is a key of ``target_vars``.
        num_reps: Number of candidate datasets to generate and compare.

    Returns:
        Always ``None``; the result is the CSV file written to disk.
    """
    # Local import keeps this block self-contained.
    from contextlib import suppress

    dataset_name, generative_model = iterable
    output_path = dataset_name + '_' + generative_model + '.csv'

    if not GENERATE_OVERWRITE:
        try:
            # Loading (rather than only checking existence) also rejects
            # empty or corrupt files, which are then regenerated.
            pd.read_csv(output_path)
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or empty/unparsable content
            # (pandas' EmptyDataError and ParserError derive from ValueError).
            pass
        else:
            return None

    if not GENERATE_MISSING:
        return None

    dfs = {i: generate_synthetic_data(dataset_name, generative_model) for i in range(num_reps)}

    df_train = pd.read_csv(dataset_name + '_train.csv')
    df_test = pd.read_csv(dataset_name + '_test.csv')

    SE = SynthEval(df_train, df_test, verbose=False)
    df_comb, _ = SE.benchmark(dfs, target_vars[dataset_name], './fast_eval.json')

    idx = df_comb['rank'].idxmax()  # get the best dataset

    dfs[idx].to_csv(output_path, index=False)

    # Remove leftover benchmark result files (presumably written by
    # SE.benchmark). Workers run in parallel and all glob the same directory,
    # so a file may already have been deleted by another worker.
    for leftover in glob.glob("SE_benchmark_*.csv"):
        with suppress(FileNotFoundError):
            os.remove(leftover)

    return None

# Set up the directories for the datasets first (see utils.utils for what
# prepare_directories does exactly).
prepare_directories(target_vars)

# One independent job per (dataset, generative model) combination, datasets
# varying slowest.
iterables_list = [
    (dataset, model)
    for dataset in target_vars
    for model in generative_model_names
]

# Wrap the job list in a notebook progress bar and hand the delayed calls to
# joblib; n_jobs=-2 is forwarded unchanged. Return values are discarded
# because work_func communicates through the files it writes.
jobs = (
    delayed(work_func)(iterable, NUM_REPEATS)
    for iterable in notebook.tqdm(iterables_list)
)
_ = Parallel(n_jobs=-2)(jobs)


    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    
