In [1]:
%store -r real_data
%store -r real_corr
%store -r column_names
%store -r synth
%store -r data_processor
%store -r results_folder

IPython not installed.


In [2]:
# Point results_folder at the CTGAN sub-folder of the restored base path.
# The guard keeps this cell idempotent: re-running it must not append
# '/ctgan' a second time.
# NOTE(review): assumes the restored base path does not itself end in '/ctgan'.
if not results_folder.endswith('/ctgan'):
    results_folder = f'{results_folder}/ctgan'

In [3]:
import itertools
import os
import re

import pandas as pd

# Hyper-parameter grid: every batch size is combined with every epoch count.
batch_sizes = [128, 256, 500]
epochs_list = [500, 1000, 2000, 3000]

# Single source of truth for the grid order. `models` and `params` are derived
# from it so the three lists always line up when they are zipped later on.
batch_epoch_combinations = list(itertools.product(batch_sizes, epochs_list))

# Model file name for each (batch size, epochs) pair. The replacement fields
# are kept on one line: a field split across lines inside a single-quoted
# f-string is a SyntaxError before Python 3.12.
models = [f"bmk2018_ctgan_hp_bs{bs}_ep{ep}.pkl"
          for bs, ep in batch_epoch_combinations]

# Keyword arguments passed to synth.train_ctgan_synthesizer for each pair.
params = [{"batch_size": bs, "epochs": ep}
          for bs, ep in batch_epoch_combinations]

In [4]:
def get_params(name: str) -> tuple[int, int]:
    """Extract the batch size and epoch count encoded in a model name.

    The name is searched for the pattern ``bs<digits>_ep<digits>``; for
    example ``"bmk2018_ctgan_hp_bs128_ep3000.pkl"`` yields ``(128, 3000)``.

    Args:
        name: Model (file) name to parse.

    Returns:
        ``(batch_size, epochs)`` as integers. When the pattern is not found,
        the fallback ``(256, 500)`` is returned instead of raising.
    """
    match = re.search(r"bs(\d+)_ep(\d+)", name)
    if match is None:
        # No hyper-parameters encoded in the name: use the fallback pair.
        return 256, 500

    batch_size, epochs = (int(group) for group in match.groups())
    return batch_size, epochs

In [None]:
def _property_score(properties, property_name):
    """Return the 'Score' of the first row whose 'Property' equals ``property_name``, rounded to 2 decimals."""
    scores = properties.loc[properties['Property'] == property_name, 'Score']
    return scores.values[0].round(2)


report_columns = ["Model", "Batch size/Epochs", "Diagnostic Score",
                  "Data Validity", "Data Structure", "Evaluation Score",
                  "Column Shapes", "Column Pair"]

# One dict per model; the DataFrame is built once after the loop instead of
# being re-concatenated (and copied) on every iteration.
report_rows = []

print("Retrieving model and analyzing data. The output will be a dataframe")
for model, param, combination in zip(models, params, batch_epoch_combinations):

    bs, ep = get_params(model)

    ctgan_synthesizer = synth.train_ctgan_synthesizer(
        model_name=model, **param)

    # Plot the loss, show it, then hand the file at `fig_name` to save_data.
    fig, fig_name = synth.get_loss_values_plot(bs, ep, ctgan_synthesizer)
    fig.show()
    with open(fig_name, 'rb') as p:
        data_processor.save_data(
            p, name=fig_name, folder=f"{results_folder}/bs-{bs}/epoch-{ep}")

    ctgan_synth_data = synth.generate_synthetic_sample(
        ctgan_synthesizer, 15000)

    diag = synth.run_diagnostic(ctgan_synth_data, model)
    qual = synth.run_evaluation(ctgan_synth_data, model)

    diag_props = diag.get_properties()
    eval_props = qual.get_properties()

    # Overall scores are expressed as percentages.
    diag_score = diag.get_score().round(4) * 100
    qual_score = qual.get_score().round(4) * 100

    report_rows.append({
        "Model": model,
        "Batch size/Epochs": f"{combination[0]}/{combination[1]}",
        "Diagnostic Score": f"{diag_score: .2f}%",
        "Data Validity": _property_score(diag_props, 'Data Validity'),
        "Data Structure": _property_score(diag_props, 'Data Structure'),
        "Evaluation Score": f"{qual_score: .2f}%",
        "Column Shapes": _property_score(eval_props, 'Column Shapes'),
        "Column Pair": _property_score(eval_props, 'Column Pair Trends'),
    })

# `columns=` keeps the header (and its order) even when no model was processed.
report_df = pd.DataFrame(report_rows, columns=report_columns)

data_processor.save_data(df=report_df.to_csv(index=True),
                         name='report_df.csv', folder=results_folder)
report_df

## Generate results and upload to Azure Storage Explorer

In [None]:
def save_overlap_insight(bs, ep, ctgan_synth_data):
    """Save the column- and row-overlap reports for one synthetic sample.

    The reports come from ``synth.identify_column_overlap`` and
    ``synth.identify_row_overlap``; each is serialised with ``to_csv`` and
    passed to ``data_processor.save_data`` under ``bs-<bs>/epoch-<ep>/``
    inside ``results_folder``.

    Args:
        bs: Batch size used to build the destination path.
        ep: Epoch count used to build the destination path.
        ctgan_synth_data: Synthetic data handed to the overlap checks.
    """
    destination = f"bs-{bs}/epoch-{ep}"

    # Column overlap first, then row overlap.
    column_overlap_csv = synth.identify_column_overlap(
        ctgan_synth_data).to_csv(index=True)
    data_processor.save_data(df=column_overlap_csv,
                             name=f"{destination}/ctgan_column_overlap.csv",
                             folder=results_folder)

    row_overlap_csv = synth.identify_row_overlap(
        ctgan_synth_data).to_csv(index=True)
    data_processor.save_data(df=row_overlap_csv,
                             name=f"{destination}/ctgan_row_overlap.csv",
                             folder=results_folder)


def save_correlation_matrix(bs, ep, ctgan_synth_data):
    """Save the synthetic correlation matrix and its difference from ``real_corr``.

    Each of the two matrices is saved twice through
    ``data_processor.save_data``: once as the HTML of its styled figure and
    once as CSV. Files go under ``bs-<bs>/epoch-<ep>/`` inside
    ``results_folder``, named ``ctgan_corr`` and ``ctgan_corr_diff``.

    Args:
        bs: Batch size used to build the destination path.
        ep: Epoch count used to build the destination path.
        ctgan_synth_data: Synthetic data the correlation matrix is built from.
    """
    corr_name = f"bs-{bs}/epoch-{ep}/ctgan_corr"

    def _save_matrix(matrix, base_name):
        """Save ``matrix`` as ``<base_name>.html`` (styled figure) and ``<base_name>.csv``."""
        fig = synth.style_correlation_matrix(matrix)
        data_processor.save_data(fig.to_html(), name=f"{base_name}.html",
                                 folder=results_folder)
        data_processor.save_data(df=matrix.to_csv(index=True),
                                 name=f"{base_name}.csv",
                                 folder=results_folder)

    # correlation matrix
    ctgan_corr = synth.generate_corr_matrix(df=ctgan_synth_data)
    _save_matrix(ctgan_corr, corr_name)

    # correlation difference (real minus synthetic)
    _save_matrix(real_corr - ctgan_corr, f"{corr_name}_diff")


def visualize_data(bs, ep, ctgan_synth_data):
    """Create the data visualisation for one model and save the figure file.

    Args:
        bs: Batch size; part of the figure name and the destination folder.
        ep: Epoch count; part of the figure name and the destination folder.
        ctgan_synth_data: Synthetic data passed to ``synth.visualize_data``.
    """
    fig_name = f"ctgan_figure_{bs}_{ep}.png"

    # The return value is not needed here.
    # NOTE(review): presumably this call writes the figure to
    # ./data/figures/evaluations/ctgan/, since it is read back from there
    # below - confirm against synth.visualize_data.
    synth.visualize_data(synthetic_data=ctgan_synth_data,
                         data_name='ctgan', column_names=column_names,
                         fig_name=fig_name)

    with open(f'./data/figures/evaluations/ctgan/{fig_name}', 'rb') as p:
        data_processor.save_data(
            p, name=fig_name, folder=f"{results_folder}/bs-{bs}/epoch-{ep}")


def generate_stats(bs, ep, ctgan_synth_data):
    """Compare real and synthetic data and save the resulting stats as CSV.

    All columns of ``synth.df`` except 'ESI_Key', 'RaterType', 'Gender' and
    'Race' are passed to ``data_processor.compare_datasets``. Only the first
    of its three return values is saved, as
    ``bs-<bs>/epoch-<ep>/ctgan_synth_stats.csv`` inside ``results_folder``.

    Args:
        bs: Batch size used to build the destination path.
        ep: Epoch count used to build the destination path.
        ctgan_synth_data: Synthetic data compared against ``real_data``.
    """
    excluded_columns = ['ESI_Key', 'RaterType', 'Gender', 'Race']
    columns = [column for column in synth.df.columns
               if column not in excluded_columns]

    # compare_datasets returns three values; only the first is used here.
    stats, _, _ = data_processor.compare_datasets(
        real_data=real_data,
        synthetic_data=ctgan_synth_data,
        columns=columns,
    )

    stats_csv = stats.to_csv(index=True)
    data_processor.save_data(df=stats_csv,
                             name=f"bs-{bs}/epoch-{ep}/ctgan_synth_stats.csv",
                             folder=results_folder)


def generate_demographics_stats(bs, ep, ctgan_synth_data):
    """Save demographic comparisons (Gender, then Race) between real and synthetic data.

    For each demographic column, ``data_processor.compare_demographics`` is
    called with a fixed list of categories. The result is saved as CSV under
    ``bs-<bs>/epoch-<ep>/`` inside ``results_folder``.

    Args:
        bs: Batch size used to build the destination path.
        ep: Epoch count used to build the destination path.
        ctgan_synth_data: Synthetic data compared against ``real_data``.
    """
    # (column, categories to compare, destination file name)
    demographics = (
        ('Gender', ['Male', 'Female'],
         f"bs-{bs}/epoch-{ep}/ctgan_gender_stats.csv"),
        ('Race', ['Caucasian', 'African American'],
         f"bs-{bs}/epoch-{ep}/ctgan_race_stats.csv"),
    )

    for column, categories, stats_name in demographics:
        demographic_stats = data_processor.compare_demographics(
            real_data=real_data,
            synthetic_data=ctgan_synth_data,
            column=column,
            categories=categories,
        )
        data_processor.save_data(df=demographic_stats.to_csv(index=True),
                                 name=stats_name, folder=results_folder)


def visualize_evaluations(bs, ep, ctgan_synth_data):
    """Create the evaluation plots for one model and save each plot file.

    ``synth.visualize_evaluation`` returns a list of plot file paths. Every
    file is opened in binary mode and passed to ``data_processor.save_data``
    under its base name, in ``<results_folder>/bs-<bs>/epoch-<ep>``.

    Args:
        bs: Batch size used to build the destination folder.
        ep: Epoch count used to build the destination folder.
        ctgan_synth_data: Synthetic data passed to ``synth.visualize_evaluation``.
    """
    plot_paths = synth.visualize_evaluation(synthetic_data=ctgan_synth_data,
                                            data_name='ctgan',
                                            demo_cols=data_processor.demo_cols)

    # Same destination for every plot, so build it once.
    target_folder = f"{results_folder}/bs-{bs}/epoch-{ep}"

    for plot_path in plot_paths:
        with open(plot_path, 'rb') as plot_file:
            data_processor.save_data(plot_file,
                                     name=os.path.basename(plot_path),
                                     folder=target_folder)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.ioff() 

# Result steps run, in this order, for every model. Each one takes
# (bs, ep, synthetic data) and saves its own output.
result_steps = (
    save_overlap_insight,         # column and row overlap insights
    save_correlation_matrix,      # correlation matrix + difference from real data
    visualize_data,               # visualization uploaded to the storage
    generate_stats,               # stats
    generate_demographics_stats,  # demographics stats
    visualize_evaluations,        # evaluation plots
)

for name in models:
    ## get batch and epochs from name
    bs, ep = get_params(name)

    ## get the synthesizer
    # NOTE(review): unlike the report loop, no batch_size/epochs are passed
    # here - confirm train_ctgan_synthesizer only needs the model name.
    ctgan_synthesizer = synth.train_ctgan_synthesizer(model_name=name)

    ## generate synthetic data
    # NOTE(review): this name has no '.csv' suffix although the payload is
    # CSV and the other saved CSVs carry one - confirm this is intended.
    synth_name = f"bs-{bs}/epoch-{ep}/ctgan_synth_data"
    ctgan_synth_data = synth.generate_synthetic_sample(ctgan_synthesizer, 15000)
    data_processor.save_data(df=ctgan_synth_data.to_csv(index=True),
                             name=synth_name, folder=results_folder)

    ## run every result step on the freshly generated sample
    for step in result_steps:
        step(bs, ep, ctgan_synth_data)


In [None]:
# Work with one specific model for the loss inspection in the cells below.
selected_model_name = 'bmk2018_ctgan_hp_bs128_ep3000.pkl'
synthetic_sample_size = 15000  # second argument of generate_synthetic_sample

ctgan_synthesizer = synth.train_ctgan_synthesizer(model_name=selected_model_name)
ctgan_synth_data = synth.generate_synthetic_sample(ctgan_synthesizer,
                                                   synthetic_sample_size)

In [None]:
# Fetch the loss table and unwrap every loss entry with ``.item()``.
output = ctgan_synthesizer.get_loss_values()
for loss_column in ('Generator Loss', 'Discriminator Loss'):
    output[loss_column] = output[loss_column].apply(lambda entry: entry.item())
output

In [None]:
import plotly.graph_objects as go


def get_loss_value_plot(loss_values=None):
    """Build a plotly figure of generator and discriminator loss per epoch.

    Args:
        loss_values: Table with 'Epoch', 'Generator Loss' and
            'Discriminator Loss' columns. When omitted, the notebook-level
            ``output`` variable is used, so existing zero-argument calls
            behave as before.

    Returns:
        A ``go.Figure`` with one scatter trace per loss column, titled
        'CTGAN Loss Values'.
    """
    if loss_values is None:
        loss_values = output

    # One trace per loss column, both plotted against the epoch.
    traces = [
        go.Scatter(x=loss_values['Epoch'], y=loss_values[column], name=column)
        for column in ('Generator Loss', 'Discriminator Loss')
    ]
    fig = go.Figure(data=traces)

    fig.update_layout(title='CTGAN Loss Values',
                      xaxis_title='Epoch', yaxis_title='Loss')

    return fig


fig = get_loss_value_plot()
fig.show()

In [None]:
import pandas as pd

# Convert loss columns that still hold text to floats.
# Columns that are already numeric are left untouched: an earlier cell may
# have converted them with ``.item()``, and ``.str`` raises on numeric data.
# The pattern is a raw string. The optional sign applies to integer and
# decimal forms alike, and an optional exponent is accepted.
loss_number_pattern = r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'

for loss_column in ('Generator Loss', 'Discriminator Loss'):
    if not pd.api.types.is_numeric_dtype(output[loss_column]):
        output[loss_column] = (
            output[loss_column]
            .astype(str)
            .str.extract(loss_number_pattern, expand=False)
            .astype(float)
        )
output