In [7]:
# List the single-table demo datasets that can be fetched with ``download_demo``.
# The import lives in this cell because it is the first one to use the name;
# without it a fresh kernel (Restart & Run All) fails here with NameError.
from sdv.datasets.demo import get_available_demos

get_available_demos('single_table')

Unnamed: 0,dataset_name,size_MB,num_tables
0,KRK_v1,0.06,1
1,adult,3.91,1
2,alarm,4.52,1
3,asia,1.28,1
4,census,98.17,1
5,census_extended,4.95,1
6,child,3.2,1
7,covtype,255.65,1
8,credit,68.35,1
9,expedia_hotel_logs,0.2,1


In [8]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import evaluate_quality as evaluate_quality_single_table
from sdv.datasets.demo import download_demo, get_available_demos
import sdmetrics.column_pairs.base
import pandas as pd
import numpy as np
import plotly.express as px
import time
from sdmetrics.reports.utils import PlotConfig

# Empty result tables keyed by 'Try'.
# NOTE(review): both frames are re-created with a 'Num Try' schema in the
# experiment cell before anything is appended, so these look unused — confirm.
result_metric = pd.DataFrame(columns=[
    'Try', 'Column 1', 'Column 2', 'Metric', 'Score', 'Time', 'Num Rows',
    'Real Correlation', 'Synthetic Correlation'
])
result_property = pd.DataFrame(columns=[
    'Try', 'Num Rows', 'Time SDV', 'Time CPT', 'Time QR', 'Score CPT', 'Score QR'
])
# Overrides a module-level setting in sdmetrics.
# NOTE(review): presumably None disables row subsampling — confirm against sdmetrics.
sdmetrics.column_pairs.base.DEFAULT_NUM_ROWS = None
# Fetch the 'census' single-table demo dataset together with its metadata.
data, metadata = download_demo('single_table', 'census')

total_rows = len(data)
# time.process_time() counts CPU time of this process (user + system), not
# wall-clock time, so time spent sleeping or waiting is excluded.
start_time = time.process_time()
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)
# Sample as many synthetic rows as there are real rows.
synthetic_data = synthesizer.sample(len(data))
end_time = time.process_time()
# CPU seconds spent on construct + fit + sample; used later as the reference cost.
time_sdv = end_time - start_time

In [9]:
# Baseline quality report: both sdmetrics module-level settings are cleared to
# None before the report is generated on the full real/synthetic tables.
sdmetrics.column_pairs.base.DEFAULT_NUM_ROWS = None
sdmetrics.column_pairs.base.DEFAULT_NUM_TRY = None
report_baseline = evaluate_quality_single_table(
    data,
    synthetic_data,
    metadata,
    verbose=False,
)

In [10]:
# Summarise the baseline report in a one-row frame (index label 0) and keep the
# per-pair ContingencySimilarity details for later comparison.
baseline_properties = report_baseline.get_properties()

# Pick the 'Column Pair Trends' row by name instead of by position (``iloc[1]``)
# so that a change in the order of properties cannot silently select another row.
baseline_cpt = baseline_properties.loc[
    baseline_properties['Property'] == 'Column Pair Trends'
].iloc[0]

baseline_values = pd.DataFrame({
    'Dataset': 'census',
    'Num Rows': len(data),
    'Time SDV': time_sdv,
    # Total report time: sum of the per-property 'Time' values.
    'Time QR': baseline_properties['Time'].sum(),
    'Time CPT': baseline_cpt['Time'],
    'Score CPT': baseline_cpt['Score'],
    'Score QR': report_baseline.get_score(),
}, index=[0])

# Only the ContingencySimilarity rows of the column-pair details are compared later.
details_baseline = report_baseline.get_details('Column Pair Trends')
details_baseline = details_baseline.loc[
    details_baseline['Metric'] == 'ContingencySimilarity',
    ['Column 1', 'Column 2', 'Score', 'Time', 'Metric', 'Num Rows'],
]

In [14]:
# Grid of settings to evaluate. Larger subsample sizes can be appended to
# ``num_rows`` (e.g. 30_000, 50_000, 100_000, 200_000, total_rows).
num_tries = [1, 2, 3, 5, 10, 50]
num_rows = [1000, 2000, 3000, 5000, 10000]

# Final column layouts of the two result tables.
metric_columns = [
    'Num Try', 'Column 1', 'Column 2', 'Metric', 'Score', 'Time', 'Num Rows',
    'Real Correlation', 'Synthetic Correlation'
]
property_columns = [
    'Num Try', 'Num Rows', 'Time CPT', 'Score CPT', 'Time QR', 'Score QR'
]
detail_columns = ['Column 1', 'Column 2', 'Score', 'Time', 'Metric', 'Num Rows']

# Collect one frame per run and concatenate once at the end. Appending to an
# initially empty frame inside the loop is quadratic and triggers the pandas
# FutureWarning about concatenating empty / all-NA entries.
property_frames = []
metric_frames = []

for num_try in num_tries:
    # Module-level sdmetrics settings; they keep the last value after the loop.
    sdmetrics.column_pairs.base.DEFAULT_NUM_TRY = num_try
    for num_row in num_rows:
        sdmetrics.column_pairs.base.DEFAULT_NUM_ROWS = num_row
        report = evaluate_quality_single_table(data, synthetic_data, metadata, verbose=False)

        properties = report.get_properties()
        # Select the property row by name rather than by position.
        cpt = properties.loc[properties['Property'] == 'Column Pair Trends'].iloc[0]
        result_values = pd.DataFrame({
            'Num Try': num_try,
            'Num Rows': num_row,
            'Time CPT': cpt['Time'],
            'Score CPT': cpt['Score'],
            'Time QR': properties['Time'].sum(),
            'Score QR': report.get_score()
        }, index=[0])
        property_frames.append(result_values)

        details = report.get_details('Column Pair Trends')
        details = details.loc[details['Metric'] == 'ContingencySimilarity', detail_columns].copy()
        details['Num Try'] = num_try
        metric_frames.append(details)

# Index labels are deliberately NOT reset: every row of ``result_property`` keeps
# label 0, which the following cell relies on when it combines these rows with
# the one-row ``baseline_values`` frame.
if property_frames:
    result_property = pd.concat(property_frames)[property_columns]
else:
    result_property = pd.DataFrame(columns=property_columns)

# ``reindex`` restores the original column order and adds the two correlation
# columns (all NaN, since the filtered details do not contain them).
if metric_frames:
    result_metric = pd.concat(metric_frames).reindex(columns=metric_columns)
else:
    result_metric = pd.DataFrame(columns=metric_columns)


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



In [15]:
# Baseline scalars. ``baseline_values`` is a one-row frame, so ``.iloc[0]`` gives
# plain numbers; the arithmetic below then no longer depends on the result rows
# and the baseline row happening to share the same index label.
baseline_time_sdv = baseline_values['Time SDV'].iloc[0]
baseline_time_qr = baseline_values['Time QR'].iloc[0]
baseline_score_qr = baseline_values['Score QR'].iloc[0]
baseline_score_cpt = baseline_values['Score CPT'].iloc[0]

# Per-setting comparison against the baseline, computed for every row at once.
# ``to_numpy()`` makes the assignments positional, independent of index labels.
diff_result = result_property.copy()
diff_result['Ratio Time with SDV'] = (
    diff_result['Time QR'].astype(float).to_numpy() / baseline_time_sdv
)
diff_result['Score Fluctuation QR'] = np.abs(
    baseline_score_qr - diff_result['Score QR'].astype(float).to_numpy()
)
diff_result['Score Fluctuation CPT'] = np.abs(
    baseline_score_cpt - diff_result['Score CPT'].astype(float).to_numpy()
)

# Map each setting to its rank (0, 1, 2, ...) so heat-map axes are evenly spaced.
unique_num_rows = sorted(num_rows)
num_rows_mapping = {value: idx for idx, value in enumerate(unique_num_rows)}
unique_num_tries = sorted(num_tries)
num_tries_mapping = {value: idx for idx, value in enumerate(unique_num_tries)}
diff_result['Num Rows Normalized'] = diff_result['Num Rows'].map(num_rows_mapping)
diff_result['Num Try Normalized'] = diff_result['Num Try'].map(num_tries_mapping)
ratio_baseline = baseline_time_qr / baseline_time_sdv

# Absolute per-pair difference to the baseline details for each setting.
# The subtraction aligns on index labels.
# NOTE(review): this assumes every run's details carry the same labels as
# ``details_baseline`` — confirm.
diff_metric = result_metric.copy()
for num_try in num_tries:
    for n_rows in num_rows:
        condition = (diff_metric['Num Rows'] == n_rows) & (diff_metric['Num Try'] == num_try)
        diff_metric.loc[condition, ['Score', 'Time']] = np.abs((result_metric.loc[condition, ['Score', 'Time']] - details_baseline[['Score', 'Time']]).values)

# Aggregate only the 'Score' column per setting (instead of aggregating every
# column and selecting 'Score' afterwards).
score_by_setting = diff_metric.groupby(['Num Rows', 'Num Try'])['Score']

# Matrices for the heat maps: rows ordered from the largest to the smallest
# try setting, columns renamed to the rank of the row-count setting.
heatmap_max_diff = (
    score_by_setting.max()
    .reset_index()
    .pivot(columns='Num Rows', index='Num Try', values='Score')
    .sort_index(ascending=False)
    .reset_index(drop=True)
    .rename(columns=num_rows_mapping)
)
heatmap_mean_diff = (
    score_by_setting.mean()
    .reset_index()
    .pivot(columns='Num Rows', index='Num Try', values='Score')
    .sort_index(ascending=False)
    .reset_index(drop=True)
    .rename(columns=num_rows_mapping)
)

heatmap_qr = (
    diff_result
    .pivot(columns='Num Rows Normalized', index='Num Try Normalized', values='Score Fluctuation QR')
    .sort_index(ascending=False)
    .reset_index(drop=True)
)

heatmap_data = (
    diff_result
    .pivot(columns='Num Rows Normalized', index='Num Try Normalized', values='Ratio Time with SDV')
    .sort_index(ascending=False)
    .reset_index(drop=True)
)

In [18]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Heat map of the quality-report time relative to the SDV time, one cell per
# (try setting, row-count setting). Rows run from the largest try rank down to
# the smallest because the pivot is sorted in descending index order.
heatmap_data = (
    diff_result
    .pivot(index='Num Try Normalized', columns='Num Rows Normalized', values='Ratio Time with SDV')
    .sort_index(ascending=False)
    .reset_index(drop=True)
)

title = f'dataset: census, #rows={total_rows}, #columns={len(data.columns)}, <br> time SDV: {np.round(time_sdv)}s, ratio baseline: {ratio_baseline:.2f}'

fig_1 = px.imshow(heatmap_data, title=title, aspect='auto', labels={'color': 'Ratio'})

# Axis ticks show the real setting values instead of their ranks.
num_rows_axis = {
    'title': 'Num Rows',
    'tickmode': 'array',
    'tickvals': list(range(len(num_rows_mapping))),
    'ticktext': list(map(str, num_rows_mapping)),
}
# Tick positions are listed in reverse so that position 0 (top row) carries the
# largest try value.
num_iteration_axis = {
    'title': 'Num Iteration',
    'tickmode': 'array',
    'tickvals': list(range(len(num_tries_mapping) - 1, -1, -1)),
    'ticktext': list(map(str, num_tries_mapping)),
}
fig_1.update_layout(
    coloraxis={'colorscale': [PlotConfig.DATACEBO_GREEN, PlotConfig.DATACEBO_DARK]},
    font={'size': PlotConfig.FONT_SIZE},
    xaxis=num_rows_axis,
    yaxis=num_iteration_axis,
)

fig_1.show()

# Three heat maps side by side (max / mean per-pair score difference and the
# overall quality-report score difference) sharing one y axis and one colour axis.
fig_combined = make_subplots(
    rows=1,
    cols=3,
    shared_yaxes=True,
    subplot_titles=(
        "Metric Max difference",
        "Metric Mean difference",
        "QR Score difference"
    )
)
# Put y position 0 at the top, like the single heat map drawn with px.imshow.
fig_combined.update_yaxes(autorange='reversed')

# Prepare x and y tick values/text
x_tickvals = list(range(len(num_rows_mapping)))
x_ticktext = [str(val) for val in num_rows_mapping.keys()]

# Tick i pairs position (n - 1 - i) with the i-th smallest try value, i.e.
# position 0 is labelled with the largest try value.
y_tickvals = list(reversed(range(len(num_tries_mapping))))
y_ticktext = [str(val) for val in num_tries_mapping.keys()]

# The heat-map matrices are sorted by descending try setting and re-indexed, so
# matrix row i must be drawn at y position i. Passing the reversed
# ``y_tickvals`` as trace coordinates flipped the rows relative to the tick
# labels (the row of the largest try setting ended up labelled with the smallest).
y_positions = list(range(len(num_tries_mapping)))

heatmap_panels = (heatmap_max_diff, heatmap_mean_diff, heatmap_qr)
for panel_col, panel in enumerate(heatmap_panels, start=1):
    fig_combined.add_trace(
        go.Heatmap(
            z=panel.values,
            x=x_tickvals,
            y=y_positions,
            coloraxis="coloraxis1"
        ),
        row=1, col=panel_col
    )
    # Every panel gets the same x axis: real row counts as tick labels.
    fig_combined.update_xaxes(
        title_text='Num Rows',
        tickmode='array',
        tickvals=x_tickvals,
        ticktext=x_ticktext,
        row=1, col=panel_col
    )

# Update layout for shared coloraxis and fonts
fig_combined.update_layout(
    coloraxis={'colorscale': [PlotConfig.DATACEBO_GREEN, PlotConfig.DATACEBO_DARK]},
    font={'size': PlotConfig.FONT_SIZE},
)

# Since y-axis is shared, set it only once
fig_combined.update_yaxes(
    title_text='Num Iteration',
    tickmode='array',
    tickvals=y_tickvals,
    ticktext=y_ticktext,
    row=1, col=1
)

fig_combined.show()


In [None]:
# NOTE(review): ``self`` is not defined anywhere in the visible notebook, so this
# cell cannot run here; it looks like a fragment copied from a class method —
# confirm whether it can be removed.
# Stores, as ``_utc_date_column``, the output column whose name equals the input column.
for column in self.get_output_columns():
    if column == self.get_input_column():
        self._utc_date_column = column
    

In [222]:
# Display the per-setting comparison table.
# NOTE(review): the saved output lists 'Relative Score Fluctuation ...' columns,
# while the cell that builds diff_result names them 'Score Fluctuation ...';
# the output appears to come from a different version of that cell — confirm.
diff_result

Unnamed: 0,Num Try,Num Rows,Time CPT,Score CPT,Time QR,Score QR,Ratio Time with SDV,Relative Score Fluctuation QR,Relative Score Fluctuation CPT,Num Rows Normalized,Num Try Normalized
0,1,1000,0.392854,0.750844,0.553073,0.81354,0.096331,1.669036,3.547703,0,0
0,1,2000,0.448718,0.761589,0.647192,0.818913,0.112724,1.019707,2.167489,1,0
0,1,3000,0.421387,0.767552,0.566433,0.821894,0.098658,0.659341,1.401496,2,0
0,1,5000,0.487563,0.77064,0.633515,0.823438,0.110341,0.472712,1.004797,3,0
0,1,10000,0.587624,0.775443,0.732226,0.82584,0.127534,0.182428,0.387769,4,0
0,2,1000,0.510192,0.750589,0.654533,0.813413,0.114002,1.684481,3.580531,0,1
0,2,2000,0.623396,0.762978,0.774258,0.819607,0.134855,0.935771,1.989074,1,1
0,2,3000,0.587847,0.768137,0.733253,0.822187,0.127713,0.623976,1.326324,2,1
0,2,5000,0.711087,0.771587,0.857111,0.823912,0.149286,0.415502,0.88319,3,1
0,2,10000,0.908475,0.774531,1.053658,0.825384,0.183519,0.237557,0.504951,4,1


In [218]:
# Display the matrix of maximum per-pair score differences
# (rows: try settings in descending order, columns: rank of the row-count setting).
heatmap_max_diff

Num Rows,0,1,2,3,4
0,0.087682,0.065329,0.050496,0.034545,0.022529
1,0.099829,0.059329,0.049871,0.043829,0.021029
2,0.113633,0.067133,0.042496,0.027303,0.020129
3,0.106829,0.063703,0.055163,0.033629,0.020729
4,0.093829,0.075979,0.064829,0.052029,0.019728


In [182]:
# The pivot below needs a 'Relative Score Fluctuation QR' column, which the cell
# that builds ``diff_result`` does not create; on a fresh kernel this cell failed
# with KeyError. Derive it here when missing: absolute deviation from the
# baseline quality-report score, as a percentage of that baseline (the colour
# bar is labelled '(%)'). An existing column is left untouched.
if 'Relative Score Fluctuation QR' not in diff_result.columns:
    baseline_score_qr = baseline_values['Score QR'].iloc[0]
    diff_result['Relative Score Fluctuation QR'] = (
        100
        * np.abs(baseline_score_qr - diff_result['Score QR'].astype(float).to_numpy())
        / baseline_score_qr
    )

# Rows run from the largest try rank down to the smallest.
heatmap_data = (
    diff_result
    .pivot(columns='Num Rows Normalized', index='Num Try Normalized', values='Relative Score Fluctuation QR')
    .sort_index(ascending=False)
    .reset_index(drop=True)
)

title = f'dataset: #tables: 1, #rows={total_rows}, #columns={len(data.columns)}, ratio baseline: {ratio_baseline:.2f}'

fig = px.imshow(
    heatmap_data,
    title=title,
    aspect="auto",
    labels=dict(color="QR Score Variation (%)"),
)
# Axis ticks show the real setting values instead of their ranks; y positions
# are listed in reverse so that position 0 (top row) carries the largest try value.
fig.update_layout(
    coloraxis={'colorscale': [PlotConfig.DATACEBO_DARK, PlotConfig.DATACEBO_GREEN]},
    font={'size': PlotConfig.FONT_SIZE},
    xaxis=dict(
        title='Num Rows',
        tickmode='array',
        tickvals=list(range(len(num_rows_mapping))),
        ticktext=[str(val) for val in num_rows_mapping.keys()],
    ),
    yaxis=dict(
        title='Num Try',
        tickmode='array',
        tickvals=list(reversed(range(len(num_tries_mapping)))),
        ticktext=[str(val) for val in (num_tries_mapping.keys())],
    )
)

fig.show()

In [None]:
# Re-apply title, colour scale, font and axis ticks to the current ``fig``.
num_rows_axis = {
    'title': 'Num Rows',
    'tickmode': 'array',
    'tickvals': list(range(len(num_rows_mapping))),
    'ticktext': list(map(str, num_rows_mapping)),
}
# Positions are listed in reverse so position 0 is labelled with the last key.
num_try_axis = {
    'title': 'Num Try',
    'tickmode': 'array',
    'tickvals': list(range(len(num_tries_mapping) - 1, -1, -1)),
    'ticktext': list(map(str, num_tries_mapping)),
}
fig.update_layout(
    title_text='Num Rows vs Num Try vs Relative Score Fluctuation QR',
    coloraxis={'colorscale': [PlotConfig.DATACEBO_DARK, PlotConfig.DATACEBO_GREEN]},
    font={'size': PlotConfig.FONT_SIZE},
    xaxis=num_rows_axis,
    yaxis=num_try_axis,
)

In [92]:
# Inspect the tick positions: one integer per try setting, starting at 0.
[*range(len(num_tries_mapping))]

[0, 1, 2, 3, 4]

In [93]:
# Inspect the try settings as strings, last mapping key first.
list(map(str, reversed(num_tries_mapping.keys())))

['10', '5', '3', '2', '1']

In [25]:
from sdmetrics.reports.utils import PlotConfig

# Density heat map of the relative quality-report score fluctuation over the
# (Num Rows, Num Try) settings, styled with the PlotConfig colours and font size.
density_layout = {
    'title_text': 'Num Rows vs Num Try vs Relative Score Fluctuation QR',
    'coloraxis': {'colorscale': [PlotConfig.DATACEBO_DARK, PlotConfig.DATACEBO_GREEN]},
    'font': {'size': PlotConfig.FONT_SIZE},
}

fig = px.density_heatmap(
    diff_result,
    x='Num Rows',
    y='Num Try',
    z='Relative Score Fluctuation QR',
)
fig.update_layout(**density_layout)

fig.show()

In [None]:
def _generate_heatmap_plot(all_data, columns):
    """Generate heatmap plot for discrete data.

    Args:
        all_data (pandas.DataFrame):
            The real and synthetic data for the desired column pair containing a
            ``Data`` column that indicates whether is real or synthetic.
        columns (list):
            A list of the columns being plotted.

    Returns:
        plotly.graph_objects._figure.Figure
    """
    unique_values = all_data['Data'].unique()

    if len(columns) != 2:
        raise ValueError('Generating a heatmap plot requires exactly two columns for the axis.')

    fig = px.density_heatmap(
        all_data, x=columns[0], y=columns[1], facet_col='Data', histnorm='probability'
    )

    title = ' vs. '.join(unique_values)
    title += f" Data for columns '{columns[0]}' and '{columns[1]}'"

    fig.update_layout(
        title_text=title,
        coloraxis={'colorscale': [PlotConfig.DATACEBO_DARK, PlotConfig.DATACEBO_GREEN]},
        font={'size': PlotConfig.FONT_SIZE},
    )

    fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1] + ' Data'))

    return fig


In [None]:
# Rank of each row-count setting, used for evenly spaced heat-map axes.
unique_num_rows = sorted(num_rows)
num_rows_mapping = {value: idx for idx, value in enumerate(unique_num_rows)}
diff_result['Num Rows Normalized'] = diff_result['Num Rows'].map(num_rows_mapping)

# Mean score per column pair and row-count setting, averaged over the try settings.
result_average = result_metric.groupby(['Column 1', 'Column 2', 'Num Rows'])['Score'].mean().reset_index()

# Signed per-pair difference to the baseline details for every setting.
# The result tables are keyed by 'Num Try' (values taken from ``num_tries``);
# filtering on a 'Try' column and looping over ``range(num_try)`` raised KeyError
# and depended on a leftover loop variable.
# NOTE(review): this overwrites ``diff_metric`` with signed differences, whereas
# the earlier cell stores absolute values — confirm which one is intended.
diff_metric = result_metric.copy()
for n_try in num_tries:
    for n_row in num_rows:
        condition = (result_metric['Num Rows'] == n_row) & (result_metric['Num Try'] == n_try)
        diff_metric.loc[condition, ['Score', 'Time']] = (
            result_metric.loc[condition, ['Score', 'Time']] - details_baseline[['Score', 'Time']]
        ).values