In [1]:
# Restore objects from IPython's %store database. None of these names is
# defined in the visible cells, so they must have been saved with `%store`
# in an earlier notebook/kernel session before this notebook is run.
# (Comments are kept on their own lines: a line magic consumes the rest of its line.)
# real_data / real_corr: the real dataset and its correlation matrix, used as
# the reference side of the comparisons further down.
%store -r real_data
%store -r real_corr
# column_names: passed to the visualisation helpers.
%store -r column_names
# synth / data_processor: helper objects whose methods do the training,
# sampling, evaluation, plotting and uploading in the cells below.
%store -r synth
%store -r data_processor

In [2]:
# Candidate CTGAN runs, one row per candidate:
#   (pickle file name, keyword arguments forwarded to the training helper).
# NOTE(review): the first file name says "bt100" but is paired with
# batch_size=128 -- confirm the pairing is intended.
experiments = [
    ('bmk2018_ctgan_hp_bt100.pkl', dict(batch_size=128, epochs=500)),
    ('bmk2018_ctgan_hp_bt256.pkl', dict(batch_size=256, epochs=500)),
    ('bmk2018_ctgan.pkl', dict(batch_size=500, epochs=500)),
    ('bmk2018_ctgan_hp_bs128_ep1000.pkl', dict(batch_size=128, epochs=1000)),
    ('bmk2018_ctgan_hp_bs128_ep3000.pkl', dict(batch_size=128, epochs=3000)),
]

# Parallel lists (same order, same length) that are iterated pairwise.
models = [model_file for model_file, _ in experiments]
params = [train_kwargs for _, train_kwargs in experiments]

In [3]:
# Compare every candidate configuration. For each (model file, params) pair:
# get the synthesizer, draw a sample of 15000 (presumably rows -- confirm),
# then run the diagnostic and the quality evaluation on that sample.

# zip() stops at the shorter input without complaint, so a missing entry in
# either list would silently skip a candidate. Fail loudly instead.
if len(models) != len(params):
    raise ValueError(
        f"models ({len(models)}) and params ({len(params)}) must have the same length")

for model, param in zip(models, params):
    # The reports printed below do not name the model they score, so label
    # each iteration to make the scores attributable.
    print(f"\n=== {model} | {param} ===")

    # NOTE(review): despite its name, the helper printed "Loading existing
    # model" in the recorded run, so it appears to reuse a saved pickle when
    # one exists -- confirm whether `param` has any effect in that case.
    ctgan_synthesizer = synth.train_ctgan_synthesizer(
        model_name=model, **param)

    ctgan_synth_data = synth.generate_synthetic_sample(
        ctgan_synthesizer, 15000)

    synth.run_diagnostic(ctgan_synth_data)
    synth.run_evaluation(ctgan_synth_data)

Loading existing model
Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 66/66 [00:00<00:00, 1470.15it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 657.31it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 66/66 [00:00<00:00, 97.55it/s] 
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 2145/2145 [00:20<00:00, 104.97it/s]

Overall Score: 92.36%

Properties:
- Column Shapes: 87.83%
- Column Pair Trends: 96.9%
Loading existing model
Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 66/66 [00:00<00:00, 1497.26it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 913.99it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 66/66 [00:00<00:00, 103.51it/s]
(2/2) Evaluating C

## Final model

In [4]:
# The best model according to the evaluation above is the 3000-epoch run
# with batch size 128.
# NOTE(review): unlike the comparison loop, no batch_size/epochs are passed
# here, so this cell relies on the saved pickle being found (the recorded
# output is "Loading existing model") -- confirm what the helper does if the
# file is missing.
ctgan_synthesizer = synth.train_ctgan_synthesizer(
    model_name='bmk2018_ctgan_hp_bs128_ep3000.pkl')

Loading existing model


In [5]:
# Draw the synthetic sample from the final synthesizer; every later cell
# (correlations, figures, statistics, CSV export) uses this one sample.
# NOTE(review): 15000 is presumably the number of rows, and no random seed is
# set in the visible cells -- confirm whether re-running reproduces the sample.
ctgan_synth_data = synth.generate_synthetic_sample(ctgan_synthesizer, 15000)

In [6]:
# Correlation matrix of the synthetic sample, rendered via the styling helper
# and saved as an HTML file.
ctgan_corr = synth.generate_corr_matrix(df=ctgan_synth_data)
# `fig` must expose .to_html(); the HTML string is what gets handed to save_data.
fig = synth.style_correlation_matrix(ctgan_corr)
data_processor.save_data(fig.to_html(), name='ctgan_corr.html')

ctgan_corr.html uploaded


In [7]:
# Element-wise difference between the real and synthetic correlation
# matrices, rendered and saved as HTML like the matrix itself.
# NOTE(review): generate_corr_matrix is the same helper that was given the raw
# sample (df=...) to build ctgan_corr. Here it receives an already-subtracted
# correlation matrix. If the helper computes correlations of its input, the
# result is "correlation of the differences", not the differences themselves;
# in that case `real_corr - ctgan_corr` should be used directly -- confirm.
ctgan_corr_diff = synth.generate_corr_matrix(real_corr - ctgan_corr)
fig = synth.style_correlation_matrix(ctgan_corr_diff)
data_processor.save_data(fig.to_html(), name='ctgan_corr_diff.html')

ctgan_corr_diff.html uploaded


In [8]:
# Render the comparison figure for the synthetic sample and upload the PNG.
# The file name is defined once and reused for rendering, reading and upload
# (the same pattern the cumulative-sum figure cell uses), so the three can no
# longer drift apart when the name is changed.
figure_name = "ctgan_figures.png"

plot = synth.visualize_data(synthetic_data=ctgan_synth_data,
                            column_names=column_names, fig_name=figure_name)

# NOTE(review): assumes visualize_data writes the image to
# ./data/figures/<fig_name> -- confirm against the helper.
# The file is opened in binary mode and the file object itself is passed on.
with open(f'./data/figures/{figure_name}', 'rb') as p:
    data_processor.save_data(p, name=figure_name)

ctgan_figures.png uploaded


In [9]:
# Disabled: diagnostic and quality reports for the final sample.
# The comparison loop earlier in the notebook already runs both reports for
# every candidate, including the model chosen here (on a different sample
# draw). Uncomment to score this exact sample.
# synth.run_diagnostic(ctgan_synth_data)
# synth.run_evaluation(ctgan_synth_data)

In [10]:
# Compare real and synthetic data on every column of synth.df except the two
# listed here.
excluded_columns = ('ESI_Key', 'RaterType')
columns = [name for name in synth.df.columns if name not in excluded_columns]

stats = data_processor.compare_datasets(
    real_data=real_data,
    synthetic_data=ctgan_synth_data,
    columns=columns,
)

# Output

In [11]:
# Upload each result table as CSV text, in this order. save_data receives the
# CSV string (not the table object) through its `df` keyword.
# NOTE(review): index=False omits the row index; for the two correlation
# matrices that presumably drops the row labels -- confirm this is intended.
csv_exports = [
    ('ctgan_synth_data', ctgan_synth_data),
    ('ctgan_corr', ctgan_corr),
    ('ctgan_corr_diff', ctgan_corr_diff),
    ('ctgan_stats', stats),
]

for export_name, table in csv_exports:
    data_processor.save_data(df=table.to_csv(index=False), name=export_name)

ctgan_synth_data uploaded
ctgan_corr uploaded
ctgan_corr_diff uploaded
ctgan_stats uploaded


In [12]:
# Single definition of the figure's file name, reused for rendering, reading
# and upload.
figure_name = "ctgan_cumsum_figure.png"

# NOTE(review): the helper is assumed to write the image to
# ./data/figures/<fig_name>, since that path is opened right below -- confirm.
# The returned `plot` is not used in the visible cells.
plot = synth.visualize_cumsum(synthetic_data=ctgan_synth_data,
                              column_names=column_names, fig_name=figure_name)

# Re-open the rendered PNG in binary mode and pass the file object to save_data.
with open(f'./data/figures/{figure_name}', 'rb') as p:
    data_processor.save_data(p, name=figure_name)

ctgan_cumsum_figure.png uploaded
