In [1]:
from sdv.datasets.demo import download_demo, get_available_demos
from sdv.single_table import GaussianCopulaSynthesizer
from rdt.transformers import FloatFormatter, UnixTimestampEncoder
from sdmetrics.reports.single_table import QualityReport
import pandas as pd
import time

In [2]:
demos = get_available_demos('single_table').sort_values('size_MB')

In [3]:
demos

Unnamed: 0,dataset_name,size_MB,num_tables
10,fake_companies,0.0,1
11,fake_hotel_guests,0.03,1
20,student_placements,0.03,1
21,student_placements_pii,0.03,1
0,KRK_v1,0.07,1
9,expedia_hotel_logs,0.2,1
19,ring,0.32,1
13,gridr,0.32,1
12,grid,0.32,1
3,asia,1.28,1


In [4]:
data, metadata = download_demo('single_table', 'student_placements')

In [5]:
metadata.columns

{'start_date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
 'end_date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
 'salary': {'sdtype': 'numerical', 'computer_representation': 'Int64'},
 'duration': {'sdtype': 'categorical'},
 'student_id': {'sdtype': 'id', 'regex_format': '\\d{30}'},
 'high_perc': {'sdtype': 'numerical', 'computer_representation': 'Float'},
 'high_spec': {'sdtype': 'categorical'},
 'mba_spec': {'sdtype': 'categorical'},
 'second_perc': {'sdtype': 'numerical', 'computer_representation': 'Float'},
 'gender': {'sdtype': 'categorical'},
 'degree_perc': {'sdtype': 'numerical', 'computer_representation': 'Float'},
 'placed': {'sdtype': 'boolean'},
 'experience_years': {'sdtype': 'numerical',
  'computer_representation': 'Float'},
 'employability_perc': {'sdtype': 'numerical',
  'computer_representation': 'Float'},
 'mba_perc': {'sdtype': 'numerical', 'computer_representation': 'Float'},
 'work_experience': {'sdtype': 'boolean'},
 'degree_type': {'sdt

## Test over one dataset

In [31]:
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer._data_processor.fit(data)
synthesizer.get_transformers()['amenities_fee'].missing_value_replacement

'mean'

In [45]:
data, metadata = download_demo('single_table', demos.iloc[1]['dataset_name'])
metadata_dict = metadata.to_dict()


synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer._data_processor.fit(data)
column_name_to_transformer = {}
for column_name in metadata_dict['columns']:
    sdtype = metadata_dict['columns'][column_name]['sdtype']
    if sdtype == 'numerical':
        column_name_to_transformer[column_name] = FloatFormatter('mean')
        print(synthesizer.get_transformers()[column_name].missing_value_replacement)
    elif sdtype == 'datetime':
        column_name_to_transformer[column_name] = UnixTimestampEncoder('mean')

#synthesizer.update_transformers(column_name_to_transformer)

synthesizer.fit(data)
s = synthesizer._sample(len(data))
s_2 = synthesizer._data_processor.reverse_transform(s)
#synthetic_data_after = synthesizer.sample(len(data))


random
random


In [46]:
metadata_dict

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'guest_email': {'sdtype': 'email', 'pii': True},
  'has_rewards': {'sdtype': 'boolean'},
  'room_type': {'sdtype': 'categorical'},
  'amenities_fee': {'sdtype': 'numerical', 'computer_representation': 'Float'},
  'checkin_date': {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'},
  'checkout_date': {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'},
  'room_rate': {'sdtype': 'numerical', 'computer_representation': 'Float'},
  'billing_address': {'sdtype': 'address', 'pii': True},
  'credit_card_number': {'sdtype': 'credit_card_number', 'pii': True}},
 'primary_key': 'guest_email'}

In [41]:
synthesizer.get_transformers()

{'guest_email': AnonymizedFaker(provider_name='internet', function_name='email', enforce_uniqueness=True),
 'has_rewards': UniformEncoder(),
 'room_type': UniformEncoder(),
 'amenities_fee': FloatFormatter(),
 'checkin_date': UnixTimestampEncoder(datetime_format='%d %b %Y'),
 'checkout_date': UnixTimestampEncoder(datetime_format='%d %b %Y'),
 'room_rate': FloatFormatter(),
 'billing_address': AnonymizedFaker(provider_name='address', function_name='address'),
 'credit_card_number': AnonymizedFaker(provider_name='credit_card', function_name='credit_card_number')}

In [44]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Value": {
            "sdtype": "numerical"
        },
        "Value_2": {
            "sdtype": "numerical"
        }
    }
}

In [50]:
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'Value': {'sdtype': 'numerical', 'computer_representation': 'Float'},
    }
})

data = np.random.randn(10000)
nan_indices = np.random.choice(10000, int(1000), replace=False)
data[nan_indices] = np.nan
real_data = pd.DataFrame({
    'Value': data
})

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
print(synthesizer.get_transformers()['Value'].missing_value_replacement)
synthetic_data = synthesizer.sample(10000)

mean


In [48]:
synthesizer.get_transformers()

{'Value': FloatFormatter(), 'Value_2': FloatFormatter()}

In [34]:
synthesizer.get_transformers()['amenities_fee'].missing_value_replacement

'random'

In [9]:
demo_datasets = get_available_demos('single_table').sort_values('size_MB')['dataset_name']

In [11]:
result_columns = pd.DataFrame(columns=[
    'dataset', 'column_name',
    '% missinge values',
    '% missing values before', '% missing values after',
    'KSComplement before', 'KSComplement after'
])

result_dataset = pd.DataFrame(columns=[
    'dataset', 'num column', 'num numerical+datetime columns',
    'num numerical+datetime columns with NaN',
    'OQS before', 'OQS after',
    'Column Shapes before', 'Column Shapes after',
    'Column Pair Trends before', 'Column Pair Trends after',
])

for dataset in demo_datasets.iloc[1:2]:

    if dataset in ['expedia_hotel_logs', 'census_extended']:
        continue

    data, metadata = download_demo('single_table', dataset)
    metadata_dict = metadata.to_dict()

    synthesizer_after = GaussianCopulaSynthesizer(metadata)
    time_start_after = time.time()
    synthesizer_after.fit(data)
    synthetic_data_after = synthesizer_after.sample(len(data))
    time_after = time.time() - time_start_after

    missing_value_replacement = synthesizer_after.get_transformers()['amenities_fee'].missing_value_replacement
    print(f'missing_value_replacement: {missing_value_replacement}')

    synthesizer_before = GaussianCopulaSynthesizer(metadata)
    synthesizer_before._data_processor.fit(data)
    column_name_to_transformer = {}
    for column_name in metadata_dict['columns']:
        sdtype = metadata_dict['columns'][column_name]['sdtype']
        if sdtype == 'numerical':
            column_name_to_transformer[column_name] = FloatFormatter('mean')
        elif sdtype == 'datetime':
            column_name_to_transformer[column_name] = UnixTimestampEncoder('mean')

    synthesizer_before.update_transformers(column_name_to_transformer)

    time_start_before = time.time()
    synthesizer_before.fit(data)
    synthetic_data_before = synthesizer_after.sample(len(data))
    time_before = time.time() - time_start_before

    missing_value_replacement = synthesizer_before.get_transformers()['amenities_fee'].missing_value_replacement
    print(f'missing_value_replacement: {missing_value_replacement}')

    report_before = QualityReport()
    report_before.generate(data, synthetic_data_before, metadata_dict, verbose=False)


    report_after = QualityReport()
    report_after.generate(data, synthetic_data_after, metadata_dict, verbose=False)

    details_before = report_before.get_details('Column Shapes')
    column_result_before = details_before.loc[details_before['Metric']=='KSComplement']
    details_after = report_after.get_details('Column Shapes')
    column_result_after = details_after.loc[details_after['Metric']=='KSComplement']

    num_col_with_nan = 0
    for column_name in column_result_before['Column']:
        nan_proportion = 100 * data[column_name].isna().sum()/len(data)
        if nan_proportion > 0:
            num_col_with_nan += 1

        nan_proportion_before = 100 * synthetic_data_before[column_name].isna().sum()/len(synthetic_data_before)
        nan_proportion_after = 100 * synthetic_data_after[column_name].isna().sum()/len(synthetic_data_after)
        ksc_before = column_result_before.loc[column_result_before['Column']==column_name, 'Score'].squeeze()
        ksc_after = column_result_after.loc[column_result_after['Column']==column_name, 'Score'].squeeze()
        column_result = pd.DataFrame({
            'dataset': dataset,
            'column_name': column_name,
            '% missinge values': nan_proportion,
            '% missing values before': nan_proportion_before,
            '% missing values after': nan_proportion_after,
            'KSComplement before': ksc_before,
            'KSComplement after': ksc_after,
        }, index=[0])

        result_columns = pd.concat([result_columns, column_result], ignore_index=True)

    dataset_result = pd.DataFrame({
        'dataset': dataset,
        'num column': len(metadata_dict['columns']),
        'num numerical+datetime columns': len(column_name_to_transformer),
        'num numerical+datetime columns with NaN': num_col_with_nan,
        'Time fit+sample before [s]': time_before,
        'Time fit+sample after [s]': time_after,
        'OQS before': report_before.get_score(),
        'OQS after': report_after.get_score(),
        'Column Shapes before': report_before.get_properties().iloc[0]['Score'],
        'Column Shapes after': report_after.get_properties().iloc[0]['Score'],
        'Column Pair Trends before': report_before.get_properties().iloc[0]['Score'],
        'Column Pair Trends after': report_after.get_properties().iloc[0]['Score'],
    }, index=[0])

    result_dataset = pd.concat([result_dataset, dataset_result], ignore_index=True)

    result_dataset.to_csv('result_dataset_2.csv', index=False)
    result_columns.to_csv('result_columns_2.csv', index=False)

missing_value_replacement: random
missing_value_replacement: mean



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



In [None]:
result

In [56]:
result_columns

Unnamed: 0,dataset,column_name,% missinge values,% missing values before,% missing values after,KSComplement before,KSComplement after
0,fake_companies,employee_id,0.0,0.0,0.0,0.583333,0.833333
1,fake_companies,age,0.0,0.0,0.0,0.916667,0.666667
2,fake_companies,age_when_joined,0.0,0.0,0.0,0.750000,0.750000
3,fake_companies,years_in_the_company,0.0,0.0,0.0,0.583333,0.666667
4,fake_companies,salary,0.0,0.0,0.0,0.583333,0.750000
...,...,...,...,...,...,...,...
1166,mnist28,2723,0.0,0.0,0.0,1.000000,1.000000
1167,mnist28,2724,0.0,0.0,0.0,1.000000,1.000000
1168,mnist28,2725,0.0,0.0,0.0,1.000000,1.000000
1169,mnist28,2726,0.0,0.0,0.0,1.000000,1.000000


In [53]:
result_dataset

Unnamed: 0,dataset,num column,num numerical+datetime columns,num numerical+datetime columns with NaN,OQS before,OQS after,Column Shapes before,Column Shapes after,Column Pair Trends before,Column Pair Trends after,Time fit+sample before [s],Time fit+sample after [s]
0,fake_companies,12,10,0,0.66441,0.713846,0.715278,0.756944,0.715278,0.756944,0.27629,0.273022
1,fake_hotel_guests,9,4,2,0.898519,0.901244,0.913601,0.908879,0.913601,0.908879,0.309128,0.322158
2,student_placements,17,9,3,0.852057,0.823763,0.869914,0.867063,0.869914,0.867063,0.354468,0.380128
3,student_placements_pii,18,9,3,0.852057,0.823763,0.869914,0.867063,0.869914,0.867063,0.380487,0.39776
4,KRK_v1,9,7,0,0.94453,0.947837,0.933375,0.93925,0.933375,0.93925,0.204559,0.218409
5,ring,2,2,0,0.9122,0.9122,0.9122,0.9122,0.9122,0.9122,0.32372,0.344128
6,gridr,2,2,0,0.881173,0.880116,0.80285,0.80375,0.80285,0.80375,0.679876,0.698697
7,grid,2,2,0,0.942311,0.941341,0.8917,0.889175,0.8917,0.889175,0.607013,0.61594
8,asia,8,0,0,0.981507,0.981426,0.998094,0.998156,0.998094,0.998156,1.925657,1.808841
9,child,20,0,0,0.967956,0.967249,0.996531,0.995659,0.996531,0.995659,5.057913,5.635183


In [36]:
column_result

Unnamed: 0,dataset,column_name,% missinge values,% missing values before,% missing values after,KSComplement before,KSComplement after
0,KRK_v1,white_rook_file,0.0,0.0,0.0,,


In [13]:
synthesizer.get_transformers()

ValueError: No transformers were returned in 'get_transformers'. Use 'auto_assign_transformers' or 'fit' to create them.

In [7]:
synthetic_data

Unnamed: 0,white_king_rank,black_king_file,black_king_rank,add_numerical,white_king_file,class,id,white_rook_rank,white_rook_file
0,4,6,2,892,5,legal,0,2,2
1,3,5,0,409,6,illegal,1,5,5
2,2,0,4,821,6,legal,2,2,4
3,1,3,3,1007,6,legal,3,6,6
4,7,3,3,728,5,legal,4,5,6
...,...,...,...,...,...,...,...,...,...
995,4,3,3,935,3,legal,995,3,1
996,2,4,1,341,3,illegal,996,2,2
997,7,6,6,451,6,legal,997,5,5
998,4,6,1,77,1,illegal,998,2,7


## Experiment B

In [82]:
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdmetrics.reports import utils
import pandas as pd
from rdt.transformers import FloatFormatter
from sdmetrics.single_column import KSComplement

## Gaussian

In [110]:
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'Value': {'sdtype': 'numerical'}
    }
})

nan_proportions = [0.05, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 0.95]
n_rows = int(1e5)

figures = []
kscomplement = []
for nan_proportion in nan_proportions:
    data = np.random.randn(n_rows)
    nan_indices = np.random.choice(n_rows, int(nan_proportion * n_rows), replace=False)
    data[nan_indices] = np.nan
    real_data = pd.DataFrame(data, columns=['Value'])

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(real_data)
    synthetic_data = synthesizer.sample(n_rows)

    metric_score = KSComplement.compute(
        real_data=real_data['Value'],
        synthetic_data=synthetic_data['Value']
    )
    kscomplement.append(metric_score)

    fig = utils.get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name='Value',
        metadata=metadata.to_dict(),
    )
    fig.update_layout(
        title=f'KSComplement: {metric_score:.2f}',
    )

    figures.append(fig)


In [111]:
kscomplement_before = kscomplement
figures_before = figures

In [108]:
kscomplement_after = kscomplement
figures_after = figures

In [109]:
for figure in figures_after:
    figure.show()

In [112]:
for figure in figures_before:
    figure.show()

In [79]:
result_exp_b = pd.DataFrame({
    'nan_proportion': nan_proportions,
    'KSComplement before': kscomplement_before,
    'KSComplement after': kscomplement_after,
})

result_exp_b['Distribution'] = 'Gaussian'
result_exp_b

## Beta

In [100]:
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'Value': {'sdtype': 'numerical', 'computer_representation': 'Float'}
    }
})

nan_proportions = [0.05, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 0.95]
n_rows = int(1e5)

figures = []
kscomplement = []
for nan_proportion in nan_proportions:
    data = np.random.beta(2, 5, n_rows)
    nan_indices = np.random.choice(n_rows, int(nan_proportion * n_rows), replace=False)
    data[nan_indices] = np.nan
    real_data = pd.DataFrame(data, columns=['Value'])

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(real_data)
    synthetic_data = synthesizer.sample(n_rows)

    metric_score = KSComplement.compute(
        real_data=real_data['Value'],
        synthetic_data=synthetic_data['Value']
    )
    kscomplement.append(metric_score)

    fig = utils.get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name='Value',
        metadata=metadata.to_dict(),
    )
    fig.update_layout(
        title=f'KSComplement: {metric_score:.2f}',
    )

    figures.append(fig)


In [96]:
kscomplement_beta_before = kscomplement
figures_beta_before = figures

In [101]:
kscomplement_beta_after = kscomplement
figures_beta_after = figures

In [94]:
result_exp_b_beta = pd.DataFrame({
    'nan_proportion': nan_proportions,
    'KSComplement before': kscomplement_beta_before,
    'KSComplement after': kscomplement_beta_after,
})

result_exp_b_beta['Distribution'] = 'Beta'
result_exp_b_beta

Unnamed: 0,nan_proportion,KSComplement before,KSComplement after,Distribution
0,0.05,0.989443,0.978355,Beta
1,0.1,0.977886,0.955801,Beta
2,0.2,0.953285,0.921431,Beta
3,0.4,0.917031,0.840007,Beta
4,0.5,0.897954,0.781455,Beta
5,0.7,0.809804,0.721755,Beta
6,0.9,0.683562,0.695936,Beta
7,0.95,0.706356,0.680647,Beta


In [99]:
for figure in figures_beta_before:
    figure.show()

In [103]:
for figure in figures_beta_after:
    figure.show()

## Uniform

In [118]:
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'Value': {'sdtype': 'numerical'}
    }
})

nan_proportions = [0.05, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 0.95]
n_rows = int(1e5)

figures = []
kscomplement = []
for nan_proportion in nan_proportions:
    data = np.random.rand(n_rows)
    nan_indices = np.random.choice(n_rows, int(nan_proportion * n_rows), replace=False)
    data[nan_indices] = np.nan
    real_data = pd.DataFrame(data, columns=['Value'])

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(real_data)
    synthetic_data = synthesizer.sample(n_rows)

    metric_score = KSComplement.compute(
        real_data=real_data['Value'],
        synthetic_data=synthetic_data['Value']
    )
    kscomplement.append(metric_score)

    fig = utils.get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name='Value',
        metadata=metadata.to_dict(),
    )
    fig.update_layout(
        title=f'KSComplement: {metric_score:.2f}',
    )

    figures.append(fig)


In [116]:
kscomplement_uniform_after = kscomplement
figures_uniform_after = figures

In [119]:
kscomplement_uniform_before = kscomplement
figures_uniform_before = figures

In [None]:
result_exp_b_beta = pd.DataFrame({
    'nan_proportion': nan_proportions,
    'KSComplement before': kscomplement_beta_before,
    'KSComplement after': kscomplement_beta_after,
})

result_exp_b_beta['Distribution'] = 'Beta'
result_exp_b_beta

In [117]:
for figure in figures_uniform_after:
    figure.show()

In [120]:
for figure in figures_uniform_before:
    figure.show()

## Highly skewed distribution

In [130]:
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'Value': {'sdtype': 'numerical', 'computer_representation': 'Float'}
    }
})

nan_proportions = [0.05, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 0.95]
n_rows = int(1e5)

figures = []
kscomplement = []
for nan_proportion in nan_proportions:
    data = np.random.beta(1 ,4, n_rows)
    nan_indices = np.random.choice(n_rows, int(nan_proportion * n_rows), replace=False)
    data[nan_indices] = np.nan
    real_data = pd.DataFrame(data, columns=['Value'])

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(real_data)
    synthetic_data = synthesizer.sample(n_rows)

    metric_score = KSComplement.compute(
        real_data=real_data['Value'],
        synthetic_data=synthetic_data['Value']
    )
    kscomplement.append(metric_score)

    fig = utils.get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name='Value',
        metadata=metadata.to_dict(),
    )
    fig.update_layout(
        title=f'KSComplement: {metric_score:.2f}',
    )

    figures.append(fig)


In [131]:
kscomplement_beta_skewed_after = kscomplement
figures_beta_skewed_after = figures

In [128]:
kscomplement_beta_skewed_before = kscomplement
figures_beta_skewed_before = figures

In [129]:
for figure in figures_beta_skewed_before:
    figure.show()

In [132]:
for figure in figures_beta_skewed_after:
    figure.show()

## Others

In [13]:
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer._data_processor.fit(real_data)
#column_name_to_transformer = {'Value': FloatFormatter()}
#synthesizer.update_transformers(column_name_to_transformer)

synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(n_rows)

missing_value_replacement = synthesizer.get_transformers()['Value'].missing_value_replacement
print(f'Missing value replacement: {missing_value_replacement}')

fig = utils.get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='Value',
    metadata=metadata.to_dict(),
)

fig.show()

Missing value replacement: mean


In [None]:
fig = utils.get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='Value',
    metadata=metadata.to_dict(),
)

fig.show()

In [133]:
from scipy.stats import beta, uniform
import numpy as np

true_min = 18.28
true_max = 46.33

loc = -78.23275036821482
scale = 2.3433513325141828e-30
a = 2170684.123605152
b = 84151.30493077749

synthetic_data = beta.rvs(a, b, loc=loc, scale=scale, size=10_000)
real_data = uniform.rvs(loc=true_min, scale=(true_max-true_min), size=10_000)

In [152]:
min(synthetic_data)

-78.23275036821482

In [134]:
max(synthetic_data)

-78.23275036821482

In [135]:
import pandas as pd

data = pd.read_csv('sdv_issue_1592.csv')
data.head()

Unnamed: 0,var1
0,27.26
1,27.57
2,33.83
3,28.43
4,38.72


In [141]:
from sdv.metadata import SingleTableMetadata



In [157]:
from sdv.single_table import GaussianCopulaSynthesizer

metadata = SingleTableMetadata.load_from_dict({
    'columns': {
        'var1': { 'sdtype': 'numerical', 'computer_representation': 'Float' },
    }
})
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)
synthetic_data_before = synthesizer.sample(len(data))

In [158]:
synthesizer.get_transformers()

{'var1': FloatFormatter()}

In [154]:
synthesizer.get_learned_distributions()

{'var1': {'distribution': 'beta',
  'learned_parameters': {'loc': -78.23275036821482,
   'scale': 2.3433513325141828e-30,
   'a': 2170684.123544158,
   'b': 84151.30493162549}}}

In [151]:
synthetic_data_before

Unnamed: 0,var1
0,
1,18.28
2,
3,18.28
4,18.28
...,...
1453,18.28
1454,18.28
1455,
1456,


In [149]:
fig = utils.get_column_plot(
        real_data=data,
        synthetic_data=synthetic_data_before,
        column_name='var1',
        metadata=metadata.to_dict(),
    )
fig.show()

In [147]:
fig = utils.get_column_plot(
        real_data=data,
        synthetic_data=synthetic_data_after,
        column_name='var1',
        metadata=metadata.to_dict(),
    )
fig.show()

In [9]:
def get_synthetic_data_and_time(metatada, data, modality):

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer._data_processor.fit(data)
    column_name_to_transformer = {}
    for column_name in metadata_dict['columns']:
        sdtype = metadata_dict['columns'][column_name]['sdtype']
        if sdtype == 'numerical':
            column_name_to_transformer[column_name] = FloatFormatter(modality)
        elif sdtype == 'datetime':
            column_name_to_transformer[column_name] = UnixTimestampEncoder(modality)

    synthesizer.update_transformers(column_name_to_transformer)
    time_start= time.time()
    synthesizer.fit(data)
    synthetic_data = synthesizer.sample(len(data))
    time_result = time.time() - time_start

    return synthetic_data, time_result

In [7]:
def add_nan_proportion(data, column_names, nan_proportion):
    data = data.copy()
    n_rows = len(data)
    for column_name in column_names:
        nan_indices = np.random.choice(n_rows, int(nan_proportion * n_rows), replace=False)
        data.loc[nan_indices, column_name] = np.nan

    return data

In [180]:
dataset

'census'

In [205]:
weird_columns

{0: {}}

In [209]:
weird_columns[2]

{'own business or self employed': 0         2.0
 1         2.0
 2         1.0
 3         0.0
 4         NaN
          ... 
 598565    NaN
 598566    0.0
 598567    0.0
 598568    0.0
 598569    NaN
 Name: own business or self employed, Length: 598570, dtype: float64,
 'year': 0         94.0
 1          NaN
 2         94.0
 3         94.0
 4         94.0
           ... 
 598565    94.0
 598566    95.0
 598567    94.0
 598568    94.0
 598569     NaN
 Name: year, Length: 598570, dtype: float64}

In [216]:
result

Unnamed: 0,dataset,column_name,% missinge values,% missing values before,% missing values after,KSComplement before,KSComplement after
0,census,age,9.999833,9.955060,9.955060,0.971421,0.957166
1,census,detailed industry recode,9.999833,9.844797,9.844797,0.576791,0.688491
2,census,detailed occupation recode,9.999833,9.949045,9.949045,0.653896,0.685438
3,census,wage per hour,9.999833,10.007852,10.007852,0.071699,0.057456
4,census,capital gains,9.999833,9.958401,9.958401,0.080093,0.037370
...,...,...,...,...,...,...,...
67,census,num persons worked for employer,20.000000,20.072506,20.072506,0.866605,0.884632
68,census,own business or self employed,20.000000,20.033079,20.033079,0.202122,0.861837
69,census,veterans benefits,20.000000,19.971933,19.971933,0.931826,0.704454
70,census,weeks worked in year,20.000000,20.172411,20.172411,0.683023,0.785199


In [219]:
(result['KSComplement after'] - result['KSComplement before']).max()

0.6597152873241438

In [226]:
weird_columns

{0: {'capital losses': 0            NaN
  1            NaN
  2            0.0
  3            0.0
  4            0.0
             ...  
  598565    1797.0
  598566      12.0
  598567    2653.0
  598568       0.0
  598569       0.0
  Name: capital losses, Length: 598570, dtype: float64,
  'ksc_before': 0.5000016706483786,
  'ksc_after': 0.9976590698943295,
  'nan_proportion': 0.9,
  'own business or self employed': 0        NaN
  1        NaN
  2        NaN
  3        NaN
  4        NaN
            ..
  598565   NaN
  598566   NaN
  598567   NaN
  598568   NaN
  598569   NaN
  Name: own business or self employed, Length: 598570, dtype: float64,
  'year': 0          NaN
  1          NaN
  2          NaN
  3          NaN
  4          NaN
            ... 
  598565     NaN
  598566     NaN
  598567    94.0
  598568     NaN
  598569     NaN
  Name: year, Length: 598570, dtype: float64},
 1: {'own business or self employed': 0        NaN
  1        NaN
  2        NaN
  3        NaN
  4        

In [234]:
from sdmetrics.visualization import get_column_plot
for idx in range(num_try):
    for key, value in weird_columns[idx].items():
        if key not in ['ksc_before', 'ksc_after', 'nan_proportion']:
            data_before = value[:int(0.5 * len(value))]
            data_after = value[int(0.5 * len(value)):]
            ksc_before = KSComplement.compute(
                real_data=data[key],
                synthetic_data=data_before
            )
            ksc_after = KSComplement.compute(
                real_data=data[key],
                synthetic_data=data_after
            )

            fig = get_column_plot(
                real_data=data,
                synthetic_data=data_before.to_frame(),
                column_name=key
            )
            fig.update_layout(
                title=f'Column name: {key}, mean, KSComplement: {ksc_before:.2f}',
            )

            fig.show()

            fig = get_column_plot(
                real_data=data,
                synthetic_data=data_after.to_frame(),
                column_name=key
            )
            fig.update_layout(
                title=f'Column name: {key}, random, KSComplement: {ksc_after:.2f}',
            )

            fig.show()
        

LinAlgError: The data appears to lie in a lower-dimensional subspace of the space in which it is expressed. This has resulted in a singular data covariance matrix, which cannot be treated using the algorithms implemented in `gaussian_kde`. Consider performing principle component analysis / dimensionality reduction and using `gaussian_kde` with the transformed data.

In [244]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
    'col_1': {'sdtype': 'state_abbr'}
}
})

data = pd.DataFrame({
    'col_1': ['MA', 'CA', 'CA']
})

synthesizer = GaussianCopulaSynthesizer(metadata, locales=['en_GB'])
synthesizer.fit(data)


AttributeError: 'Generator' object has no attribute 'state_abbr'

In [243]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
    'col_1': {'sdtype': 'state_abbr'}
}
})

data = pd.DataFrame({
    'col_1': ['MA', 'CA', 'CA']
})

synthesizer = GaussianCopulaSynthesizer(metadata, locales=['en_US', 'en_GB'])
synthesizer.fit(data)



Locales ['en_GB'] do not support provider 'address.en_US' and function 'state_abbr'.
In place of these locales, 'en_US' will be used instead. Please refer to the localized provider docs for more information: https://faker.readthedocs.io/en/master/locales.html



In [15]:
from sdmetrics.visualization import get_column_plot
import numpy as np
import pandas as pd
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata
from sdmetrics.visualization import get_column_plot
from sdv.datasets.demo import download_demo
from rdt.transformers import FloatFormatter, UnixTimestampEncoder
import time

np.random.seed(np.random.randint(1000))
data, metadata = download_demo('single_table', 'census')
metadata_dict = metadata.to_dict()

numerical_or_datetime_column = [
    column_name for column_name in metadata_dict['columns']
    if metadata_dict['columns'][column_name]['sdtype'] in ['numerical', 'datetime']
]

real_data = add_nan_proportion(data, numerical_or_datetime_column, 0.7)
synthetic_data_before, time_before = get_synthetic_data_and_time(metadata, real_data, 'mean')
synthetic_data_after, time_after = get_synthetic_data_and_time(metadata, real_data, 'random')


TypeError: get_column_plot() missing 1 required positional argument: 'column_name'

In [23]:
for column in numerical_or_datetime_column:
    ksc_before = KSComplement.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data_before[column]
    )
    ksc_after = KSComplement.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data_after[column]
    )
    print(f'column: {column}')
    print(f'ksc before: {ksc_before}')
    print(f'ksc after: {ksc_after}')


column: age
ksc before: 0.8173580274021713
ksc after: 0.8460232576645919
column: detailed industry recode
ksc before: 0.5334529246570077
ksc after: 0.5981621986180667
column: detailed occupation recode
ksc before: 0.7234932023198649
ksc after: 0.5878554040857864
column: wage per hour
ksc before: 0.05620029848751473
ksc after: 0.06201211584948407
column: capital gains
ksc before: 0.03738879623927993
ksc after: 0.039034538991895906
column: capital losses
ksc before: 0.27188083668250096
ksc after: 0.02780445190142078
column: dividends from stocks
ksc before: 0.10245257227681914
ksc after: 0.09803706166337767
column: num persons worked for employer
ksc before: 0.5135210389147529
ksc after: 0.6598962804350182
column: own business or self employed
ksc before: 0.4595291401516399
ksc after: 0.505249211808311
column: veterans benefits
ksc before: 0.7481876187409368
ksc after: 0.6268566555146285
column: weeks worked in year
ksc before: 0.6477863005000671
ksc after: 0.5591213638081965
column: yea

In [28]:
from sdmetrics.single_column import KSComplement

ksc_before = KSComplement.compute(
    real_data=real_data['num persons worked for employer'],
    synthetic_data=synthetic_data_before['num persons worked for employer']
)

In [29]:
ksc_after = KSComplement.compute(
    real_data=real_data['num persons worked for employer'],
    synthetic_data=synthetic_data_after['num persons worked for employer']
)

In [30]:
fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data_after,
    column_name='num persons worked for employer'
)
fig.update_layout(
    title=f'num persons worked for employer, random, KSComplement: {ksc_after:.2f}',
)
fig.show()

In [31]:
fig = get_column_plot(
    real_data=data,
    synthetic_data=synthetic_data_before,
    column_name='num persons worked for employer'
)
fig.update_layout(
    title=f'num persons worked for employer, mean, KSComplement: {ksc_before:.2f}',
)
fig.show()

In [222]:
#list_dataset = ['student_placements', 'KRK_v1', 'adult', 'census']
#nan_proportions = [0.1, 0.2, 0.4, 0.7, 0.9]

list_dataset = ['census']
nan_proportions = [0.1, 0.2, 0.4, 0.7, 0.9]
col_good = 'wage per hour'
num_try = 3
result_dataset = pd.DataFrame(columns=[
    'dataset', 'nan proportion', 'num column', 'num numerical+datetime columns',
    'OQS before', 'OQS after',
    'Column Shapes before', 'Column Shapes after',
    'Column Pair Trends before', 'Column Pair Trends after',
])

weird_columns = {}
result_columns = pd.DataFrame(columns=[
    'dataset', 'column_name',
    '% missinge values',
    '% missing values before', '% missing values after',
    'KSComplement before', 'KSComplement after'
])

for idx in range(num_try):
    np.random.seed(np.random.randint(1000))
    weird_columns[idx] = {}
    for dataset in list_dataset:

        data, metadata = download_demo('single_table', dataset)
        metadata_dict = metadata.to_dict()

        numerical_or_datetime_column = [
            column_name for column_name in metadata_dict['columns']
            if metadata_dict['columns'][column_name]['sdtype'] in ['numerical', 'datetime']
        ]
        for nan_proportion in nan_proportions:
            real_data = add_nan_proportion(data, numerical_or_datetime_column, nan_proportion)
            synthetic_data_before, time_before = get_synthetic_data_and_time(metadata, real_data, 'mean')
            synthetic_data_after, time_after = get_synthetic_data_and_time(metadata, real_data, 'random')
        
            report_before = QualityReport()
            report_before.generate(data, synthetic_data_before, metadata_dict, verbose=False)


            report_after = QualityReport()
            report_after.generate(data, synthetic_data_after, metadata_dict, verbose=False)

            details_before = report_before.get_details('Column Shapes')
            column_result_before = details_before.loc[details_before['Metric']=='KSComplement']
            details_after = report_after.get_details('Column Shapes')
            column_result_after = details_after.loc[details_after['Metric']=='KSComplement']

            
            for column_name in column_result_before['Column']:
                real_nan_proportion = 100 * real_data[column_name].isna().sum()/len(data)
                nan_proportion_before = 100 * synthetic_data_before[column_name].isna().sum()/len(synthetic_data_before)
                nan_proportion_after = 100 * synthetic_data_after[column_name].isna().sum()/len(synthetic_data_after)
                ksc_before = column_result_before.loc[column_result_before['Column']==column_name, 'Score'].squeeze()
                ksc_after = column_result_after.loc[column_result_after['Column']==column_name, 'Score'].squeeze()
                column_result = pd.DataFrame({
                    'dataset': dataset,
                    'column_name': column_name,
                    '% missinge values': real_nan_proportion,
                    '% missing values before': nan_proportion_before,
                    '% missing values after': nan_proportion_after,
                    'KSComplement before': ksc_before,
                    'KSComplement after': ksc_after,
                }, index=[0])

                if np.abs(ksc_before - ksc_after) > 0.4:
                    weird_columns[idx][column_name] = pd.concat(
                        [synthetic_data_before[column_name], synthetic_data_after[column_name]],
                        ignore_index=True
                    )
                    weird_columns[idx]['ksc_before'] = ksc_before
                    weird_columns[idx]['ksc_after'] = ksc_after
                    weird_columns[idx]['nan_proportion'] = nan_proportion

                    print(weird_columns[idx][column_name])

                result_columns = pd.concat([result_columns, column_result], ignore_index=True)

            dataset_result = pd.DataFrame({
                'dataset': dataset,
                'nan proportion': 100 * nan_proportion, 
                'num column': len(metadata_dict['columns']),
                'num numerical+datetime columns': len(numerical_or_datetime_column),
                'Time fit+sample before [s]': time_before,
                'Time fit+sample after [s]': time_after,
                'OQS before': report_before.get_score(),
                'OQS after': report_after.get_score(),
                'Column Shapes before': report_before.get_properties().iloc[0]['Score'],
                'Column Shapes after': report_after.get_properties().iloc[0]['Score'],
                'Column Pair Trends before': report_before.get_properties().iloc[1]['Score'],
                'Column Pair Trends after': report_after.get_properties().iloc[1]['Score'],
            }, index=[0])

            result_dataset = pd.concat([result_dataset, dataset_result], ignore_index=True)
            #fig = utils.get_column_plot(
            #    real_data=data,
            #    synthetic_data=synthetic_data_after,
            #    column_name='experience_years',
            #    metadata=metadata.to_dict(),
            #)
            #ksc_before_good = column_result_before.loc[column_result_before['Column']==col_good, 'Score'].squeeze()
            #fig.update_layout(
            #    title=f'After changes, KSComplement {ksc_before_good:.2f}',
            #)
            #fig.show()

            #fig = utils.get_column_plot(
            #    real_data=data,
            #    synthetic_data=synthetic_data_before,
            #    column_name='experience_years',
            #    metadata=metadata.to_dict(),
            #)
            #ksc_after_good = column_result_after.loc[column_result_before['Column']==col_good, 'Score'].squeeze()
            #fig.update_layout(
            #    title=f'Before changes, KSComplement {ksc_after_good:.2f}',
            #)
            #fig.show()

            result_dataset.to_csv(f'result_dataset_experiment_B_{3 + idx}.csv', index=False)
            result_columns.to_csv(f'result_columns_experiment_B._{3 + idx}.csv', index=False)


Unable to fit to a <class 'copulas.univariate.beta.BetaUnivariate'> distribution for column weeks worked in year. Using a Gaussian distribution instead.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



0            NaN
1            NaN
2            0.0
3            0.0
4            0.0
           ...  
598565    1797.0
598566      12.0
598567    2653.0
598568       0.0
598569       0.0
Name: capital losses, Length: 598570, dtype: float64
0         2.0
1         2.0
2         0.0
3         2.0
4         NaN
         ... 
598565    NaN
598566    0.0
598567    0.0
598568    0.0
598569    NaN
Name: own business or self employed, Length: 598570, dtype: float64
0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
598565    NaN
598566    NaN
598567    NaN
598568    1.0
598569    NaN
Name: own business or self employed, Length: 598570, dtype: float64
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
598565   NaN
598566   NaN
598567   NaN
598568   NaN
598569   NaN
Name: own business or self employed, Length: 598570, dtype: float64
0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
598565     NaN



Unable to fit to a <class 'copulas.univariate.beta.BetaUnivariate'> distribution for column weeks worked in year. Using a Gaussian distribution instead.


Unable to fit to a <class 'copulas.univariate.beta.BetaUnivariate'> distribution for column veterans benefits. Using a Gaussian distribution instead.



0         1.0
1         2.0
2         2.0
3         1.0
4         NaN
         ... 
598565    NaN
598566    0.0
598567    0.0
598568    0.0
598569    NaN
Name: own business or self employed, Length: 598570, dtype: float64
0         94.0
1          NaN
2         94.0
3         94.0
4         94.0
          ... 
598565    94.0
598566    95.0
598567    94.0
598568    94.0
598569     NaN
Name: year, Length: 598570, dtype: float64
0          NaN
1          NaN
2         94.0
3         94.0
4         94.0
          ... 
598565     NaN
598566     NaN
598567    94.0
598568    95.0
598569     NaN
Name: year, Length: 598570, dtype: float64
0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
598565    NaN
598566    NaN
598567    NaN
598568    0.0
598569    NaN
Name: own business or self employed, Length: 598570, dtype: float64
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
598565   NaN
598566   NaN
598567   NaN
598568   NaN
598569   N


Unable to fit to a <class 'copulas.univariate.beta.BetaUnivariate'> distribution for column weeks worked in year. Using a Gaussian distribution instead.



0            0.0
1          299.0
2           24.0
3            0.0
4            0.0
           ...  
598565    1975.0
598566     399.0
598567    2714.0
598568    3384.0
598569       NaN
Name: wage per hour, Length: 598570, dtype: float64



Unable to fit to a <class 'copulas.univariate.beta.BetaUnivariate'> distribution for column veterans benefits. Using a Gaussian distribution instead.



0         2.0
1         1.0
2         0.0
3         1.0
4         NaN
         ... 
598565    NaN
598566    0.0
598567    0.0
598568    0.0
598569    NaN
Name: own business or self employed, Length: 598570, dtype: float64
0            1.0
1            NaN
2            NaN
3            0.0
4            5.0
           ...  
598565       NaN
598566       0.0
598567    1508.0
598568      21.0
598569       NaN
Name: wage per hour, Length: 598570, dtype: float64
0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
598565    NaN
598566    NaN
598567    NaN
598568    0.0
598569    NaN
Name: own business or self employed, Length: 598570, dtype: float64
0          NaN
1          NaN
2         95.0
3          NaN
4          NaN
          ... 
598565     NaN
598566     NaN
598567    94.0
598568     NaN
598569     NaN
Name: year, Length: 598570, dtype: float64
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
598565   NaN
598566   NaN
59856

In [203]:
f'result_dataset_experiment_B_{3 + idx}.csv'

'result_dataset_experiment_B_3.csv'

In [201]:
result_dataset

Unnamed: 0,dataset,nan proportion,num column,num numerical+datetime columns,OQS before,OQS after,Column Shapes before,Column Shapes after,Column Pair Trends before,Column Pair Trends after


In [196]:
result_dataset

Unnamed: 0,dataset,nan proportion,num column,num numerical+datetime columns,OQS before,OQS after,Column Shapes before,Column Shapes after,Column Pair Trends before,Column Pair Trends after,Time fit+sample before [s],Time fit+sample after [s]
0,census,10.0,41,12,0.864581,0.826405,0.877509,0.873981,0.851653,0.778829,150.942633,148.27174


In [184]:
result_dataset.columns

Index(['dataset', 'nan proportion', 'num column',
       'num numerical+datetime columns', 'OQS before', 'OQS after',
       'Column Shapes before', 'Column Shapes after',
       'Column Pair Trends before', 'Column Pair Trends after',
       'Time fit+sample before [s]', 'Time fit+sample after [s]'],
      dtype='object')

In [171]:
synthetic_data_before

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,tsanchez@example.com,False,BASIC,21.59,08 Nov 2020,15 Nov 2020,126.35,"24570 Wilson Walks\nWest Megan, WY 54869",3582077138450885
1,bellshawn@example.com,False,BASIC,,02 Sep 2020,13 Sep 2020,92.09,Unit 2094 Box 3077\nDPO AE 02522,4142271383722418
2,iwhite@example.org,False,BASIC,,30 Aug 2020,14 Oct 2020,100.98,"96602 Carl Spur Apt. 379\nCatherineberg, NM 41348",6573028438398211
3,christophermiller@example.com,False,BASIC,40.12,11 Jul 2020,23 Aug 2020,174.10,"6897 Joseph Meadow Suite 514\nVillarrealberg, ...",30343480880655
4,dgarcia@example.org,True,DELUXE,,30 Jan 2020,22 Jan 2020,211.75,"930 Matthew Union Suite 195\nWest Cynthia, NM ...",4930915359735
...,...,...,...,...,...,...,...,...,...
210,kevinsilva@example.org,False,BASIC,3.84,23 Jun 2020,31 May 2020,259.61,"998 Williams Rapids Suite 292\nEast Thomas, SD...",4888484762590915330
211,kmiddleton@example.com,False,BASIC,41.19,03 Mar 2020,12 Apr 2020,116.44,"0569 Gomez Prairie\nEast Donaldhaven, TX 22415",3592134146745083
212,foconnor@example.net,False,BASIC,2.63,01 Jan 2021,05 Jan 2021,148.11,"5605 Martinez Rest\nMartinbury, AK 71747",345878654176497
213,crystal75@example.org,False,BASIC,24.24,09 Nov 2020,14 Dec 2020,140.43,"201 Tara Extension\nSouth Curtis, FL 29568",180072486793418


In [172]:
data

Unnamed: 0,student_id,gender,second_perc,high_perc,high_spec,degree_perc,degree_type,work_experience,experience_years,employability_perc,mba_spec,mba_perc,salary,placed,start_date,end_date,duration
0,17264,M,67.00,91.00,Commerce,58.00,Sci&Tech,False,0,55.0,Mkt&HR,58.80,27000.0,True,2020-07-23,2020-10-12,3.0
1,17265,M,79.33,78.33,Science,77.48,Sci&Tech,True,1,86.5,Mkt&Fin,66.28,20000.0,True,2020-01-11,2020-04-09,3.0
2,17266,M,65.00,68.00,Arts,64.00,Comm&Mgmt,False,0,75.0,Mkt&Fin,57.80,25000.0,True,2020-01-26,2020-07-13,6.0
3,17267,M,56.00,52.00,Science,52.00,Sci&Tech,False,0,66.0,Mkt&HR,59.43,,False,,,
4,17268,M,85.80,73.60,Commerce,73.30,Comm&Mgmt,False,0,96.8,Mkt&Fin,55.50,42500.0,True,2020-07-04,2020-09-27,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,17474,M,80.60,82.00,Commerce,77.60,Comm&Mgmt,False,0,91.0,Mkt&Fin,74.49,40000.0,True,2020-07-27,2020-10-20,3.0
211,17475,M,58.00,60.00,Science,72.00,Sci&Tech,False,0,74.0,Mkt&Fin,53.62,27500.0,True,2020-01-23,2020-08-04,6.0
212,17476,M,67.00,67.00,Commerce,73.00,Comm&Mgmt,True,1,59.0,Mkt&Fin,69.72,29500.0,True,2020-01-25,2020-08-05,6.0
213,17477,F,74.00,66.00,Commerce,58.00,Comm&Mgmt,False,0,70.0,Mkt&HR,60.23,20400.0,True,2020-01-19,2020-04-20,3.0


In [173]:
real_data

Unnamed: 0,student_id,gender,second_perc,high_perc,high_spec,degree_perc,degree_type,work_experience,experience_years,employability_perc,mba_spec,mba_perc,salary,placed,start_date,end_date,duration
0,17264,M,67.00,,Commerce,,Sci&Tech,False,0.0,55.0,Mkt&HR,58.80,27000.0,True,2020-07-23,2020-10-12,3.0
1,17265,M,79.33,,Science,77.48,Sci&Tech,True,1.0,86.5,Mkt&Fin,66.28,20000.0,True,2020-01-11,2020-04-09,3.0
2,17266,M,65.00,68.0,Arts,64.00,Comm&Mgmt,False,0.0,75.0,Mkt&Fin,57.80,25000.0,True,2020-01-26,2020-07-13,6.0
3,17267,M,56.00,52.0,Science,52.00,Sci&Tech,False,,66.0,Mkt&HR,59.43,,False,,,
4,17268,M,85.80,73.6,Commerce,73.30,Comm&Mgmt,False,0.0,96.8,Mkt&Fin,55.50,42500.0,True,2020-07-04,2020-09-27,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,17474,M,80.60,82.0,Commerce,,Comm&Mgmt,False,0.0,91.0,Mkt&Fin,74.49,40000.0,True,2020-07-27,2020-10-20,3.0
211,17475,M,58.00,60.0,Science,72.00,Sci&Tech,False,0.0,74.0,Mkt&Fin,53.62,27500.0,True,2020-01-23,2020-08-04,6.0
212,17476,M,67.00,67.0,Commerce,73.00,Comm&Mgmt,True,1.0,59.0,Mkt&Fin,69.72,29500.0,True,2020-01-25,2020-08-05,6.0
213,17477,F,74.00,66.0,Commerce,58.00,Comm&Mgmt,False,0.0,70.0,Mkt&HR,60.23,20400.0,True,2020-01-19,2020-04-20,3.0


In [1]:
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(len(real_data))
synthetic_data.head()
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata
)
quality_report.get_visualization('Column Shapes')

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 9/9 [00:00<00:00, 2362.25it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 36/36 [00:00<00:00, 567.90it/s]

Overall Quality Score: 88.7%

Properties:
- Column Shapes: 89.11%
- Column Pair Trends: 88.3%


In [2]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='room_rate',
    metadata=metadata
)

fig.show()

In [3]:
synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dsullivan@example.net,True,BASIC,2.34,2020-03-26,2020-04-11,119.53,"90469 Karla Knolls Apt. 781\nSusanberg, CA 70033",5161033759518983
1,steven59@example.org,False,DELUXE,,2020-07-02,2020-09-14,174.7,"6108 Carla Ports Apt. 116\nPort Evan, MI 71694",4133047413145475690
2,brandon15@example.net,False,BASIC,22.08,2020-03-30,2020-03-17,148.34,86709 Jeremy Manors Apt. 786\nPort Garychester...,4977328103788
3,humphreyjennifer@example.net,False,BASIC,8.18,2020-05-03,2020-05-22,177.51,"8906 Bobby Trail\nEast Sandra, NY 43986",3524946844839485
4,joshuabrown@example.net,False,SUITE,7.69,2020-01-13,2020-01-10,187.93,"732 Dennis Lane\nPort Nicholasstad, DE 49786",4446905799576890978


In [6]:
len(real_data)

500

In [18]:
from sdv.evaluation.single_table import get_column_pair_plot
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(len(real_data))
synthetic_data.head()
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata
)
quality_report.get_visualization('Column Shapes')

fig = get_column_pair_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_names=['room_rate', 'amenities_fee'],
    metadata=metadata,
    sample_size=3
)

fig.show()

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 9/9 [00:00<00:00, 1029.75it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 36/36 [00:00<00:00, 475.62it/s]

Overall Quality Score: 88.7%

Properties:
- Column Shapes: 89.11%
- Column Pair Trends: 88.3%
     room_rate  amenities_fee
318     108.37          11.70
458     105.84          15.56
281     102.65          13.41
     room_rate  amenities_fee
163      87.89          48.12
351      84.08          25.86
143      93.01           5.34


In [21]:
fig.data[1].x

array([87.89, 84.08, 93.01])

In [22]:
len(synthetic_data)

500

In [1]:
import numpy as np
import pandas as pd
from sdmetrics.demos import load_demo
from sdmetrics.reports.multi_table import DiagnosticReport
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata.single_table import SingleTableMetadata

real_data, _,  metadata = load_demo(modality='single_table')
metadata = SingleTableMetadata().load_from_dict(metadata)

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(len(real_data))

In [3]:
synthetic_data.to_csv('synthetic.csv', index=False)

In [1]:
from sdv.datasets.demo import download_demo
from sdv.single_table import GaussianCopulaSynthesizer

real_data, metadata = download_demo('single_table', 'fake_hotel_guests')

synthesizer = GaussianCopulaSynthesizer(metadata)

# Run
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(len(real_data))

In [3]:
synthetic_data.columns

Index(['guest_email', 'has_rewards', 'room_type', 'amenities_fee',
       'checkin_date', 'checkout_date', 'room_rate', 'billing_address',
       'credit_card_number'],
      dtype='object')

In [2]:
real_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,27 Dec 2020,29 Dec 2020,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,30 Dec 2020,02 Jan 2021,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,28 Dec 2020,31 Dec 2020,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,05 Apr 2020,,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


In [17]:
synthetic_data.to_csv('synthetic.csv', index=False)

In [1]:
import rdt
rdt.__version__

'1.9.0.dev0'

In [18]:
import pandas as pd
from sdv.datasets.demo import  download_demo
from rdt.transformers import FloatFormatter, UnixTimestampEncoder
data, metadata = download_demo('single_table', 'student_placements')
metadata.visualize()

from sdv.single_table import GaussianCopulaSynthesizer

synth = GaussianCopulaSynthesizer(metadata)
synth._preprocess(data)
synth.update_transformers({
    'start_date': UnixTimestampEncoder(datetime_format='%Y-%m-%d', enforce_min_max_values=True),
    'end_date': UnixTimestampEncoder(datetime_format='%Y-%m-%d', enforce_min_max_values=True),
})
synth.fit(data)
synth_data = synth.sample(len(data))

print(pd.to_datetime(data['start_date']).min())
print(pd.to_datetime(synth_data['start_date']).min())
print(pd.to_datetime(data['start_date']).max())
print(pd.to_datetime(synth_data['start_date']).max())



NotFittedError: Error: Sampling terminated. Partial results are stored in a temporary file: .sample.csv.temp. This file will be overridden the next time you sample. Please rename the file if you wish to save these results.
The HyperTransformer is not ready to use. Please fit your data first using 'fit' or 'fit_transform'.

In [20]:
print(pd.to_datetime(data['start_date']).min())
print(pd.to_datetime(synth_data['start_date']).min())

2020-01-01 00:00:00
2020-01-01 00:00:00


In [21]:
print(pd.to_datetime(data['start_date']).max())
print(pd.to_datetime(synth_data['start_date']).max())

2020-07-28 00:00:00
2020-07-27 00:00:00


In [22]:
print(pd.to_datetime(data['end_date']).min())
print(pd.to_datetime(synth_data['end_date']).min())

2020-03-20 00:00:00
2020-03-23 00:00:00


In [23]:
print(pd.to_datetime(data['end_date']).max())
print(pd.to_datetime(synth_data['end_date']).max())

2021-01-28 00:00:00
2021-01-22 00:00:00


In [16]:
synth.get_transformers()['start_date']

UnixTimestampEncoder(datetime_format='%Y-%m-%d', enforce_min_max_values=True)

In [11]:
synth.get_transformers()['start_date'].enforce_min_max_values

False

In [1]:
from sdv.datasets.demo import download_demo, get_available_demos
get_available_demos('single_table')

Unnamed: 0,dataset_name,size_MB,num_tables
0,KRK_v1,0.07,1
1,adult,3.91,1
2,alarm,4.52,1
3,asia,1.28,1
4,census,98.17,1
5,census_extended,4.95,1
6,child,3.2,1
7,covtype,255.65,1
8,credit,68.35,1
9,expedia_hotel_logs,0.2,1


In [2]:
from faker import Faker
import inspect

fake = Faker()

dir(fake)


['__annotations__',
 '__class__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_factories',
 '_factory_map',
 '_locales',
 '_map_provider_method',
 '_optional_proxy',
 '_select_factory',
 '_select_factory_choice',
 '_select_factory_distribution',
 '_unique_proxy',
 '_weights',
 'aba',
 'add_provider',
 'address',
 'administrative_unit',
 'am_pm',
 'android_platform_token',
 'ascii_company_email',
 'ascii_email',
 'ascii_free_email',
 'ascii_safe_email',
 'bank_country',
 'basic_phone_number',
 'bban',
 'binary',
 'boolean',
 'bothify',
 'bs',
 'building_number',
 'cache_pattern',
 'catch_phrase',
 'century',
 'chr

In [11]:
# Custom predicate to filter out methods that raise TypeError
from faker import Faker
import inspect

def safe_methods(obj):
    methods = []
    for name in dir(obj):
        if name.startswith("_"):
            continue
        try:
            attribute = getattr(obj, name)
            if callable(attribute):
                methods.append(name)
        except TypeError:
            continue
    return methods

# Get safe methods from the Faker instance
instance = Faker(['fr_FR'])
faker_methods = safe_methods(instance)

len(faker_methods)


282

In [12]:
faker_methods

['aba',
 'add_provider',
 'address',
 'administrative_unit',
 'am_pm',
 'android_platform_token',
 'area_code_with_separator',
 'area_code_without_separator',
 'ascii_company_email',
 'ascii_email',
 'ascii_free_email',
 'ascii_safe_email',
 'bank_country',
 'bban',
 'binary',
 'boolean',
 'bothify',
 'bs',
 'building_number',
 'catch_phrase',
 'catch_phrase_attribute',
 'catch_phrase_noun',
 'catch_phrase_verb',
 'century',
 'chrome',
 'city',
 'city_prefix',
 'city_suffix',
 'color',
 'color_hsl',
 'color_hsv',
 'color_name',
 'color_rgb',
 'color_rgb_float',
 'company',
 'company_email',
 'company_suffix',
 'coordinate',
 'country',
 'country_calling_code',
 'country_code',
 'credit_card_expire',
 'credit_card_full',
 'credit_card_number',
 'credit_card_provider',
 'credit_card_security_code',
 'cryptocurrency',
 'cryptocurrency_code',
 'cryptocurrency_name',
 'csv',
 'currency',
 'currency_code',
 'currency_name',
 'currency_symbol',
 'current_country',
 'current_country_code',
 'd

In [8]:
pd.to_datetime(data['start_date']).max()

Timestamp('2020-07-28 00:00:00')

In [15]:
import re
from sdv.metadata import SingleTableMetadata

re.sub(r'[^a-zA-Z]', '', 'city').lower()
metadata = SingleTableMetadata()
for reference, sdtype in metadata._REFERENCE_TO_SDTYPES.items():
        if 'city' == reference or reference in 'city':
            print('la')

la


In [44]:
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'postal_code': {'sdtype': 'categorical'},
        'distance': {'sdtype': 'numerical'},
    }
})

num_rows = 10000000
list_zip_code = ['10001', '12002', '13003', '14004', '15005', '16006', '17007', '18008', '19009', '20010']
postal_codes = np.random.choice(list_zip_code, num_rows)
data = pd.DataFrame({
    'postal_code': postal_codes,
})
for ind, code in enumerate(list_zip_code):
    n_rows = len(data.loc[data['postal_code'] == code])
    data.loc[data['postal_code'] == code, 'distance'] = np.random.uniform(10**ind, 10**(ind+1), n_rows)

In [38]:
data

Unnamed: 0,postal_code,distance
0,14004,1.211819e+03
1,12002,9.430649e+01
2,19009,2.718990e+08
3,10001,7.650552e+00
4,15005,8.883725e+04
...,...,...
9995,14004,8.908310e+03
9996,14004,8.980540e+03
9997,13003,2.765473e+02
9998,12002,6.716472e+01


In [45]:
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)
synthetic_data = synthesizer.sample(len(data))

In [40]:
data.groupby('postal_code')['distance'].mean()

postal_code
10001    5.594092e+00
12002    5.580240e+01
13003    5.372964e+02
14004    5.536669e+03
15005    5.471069e+04
16006    5.468242e+05
17007    5.353117e+06
18008    5.485537e+07
19009    5.433880e+08
20010    5.530569e+09
Name: distance, dtype: float64

In [41]:
synthetic_data.groupby('postal_code')['distance'].mean()

postal_code
10001    3.611027e+08
12002    2.099111e+08
13003    5.883963e+08
14004    1.160549e+08
15005    4.704826e+08
16006    9.880514e+08
17007    6.698060e+08
18008    1.721958e+09
19009    2.481283e+08
20010    8.171610e+08
Name: distance, dtype: float64

In [None]:
import plotly.express as px

fig = px.box(
        data,
        x='postal_code',
        y='distance',
)
fig.update_layout(
    yaxis = dict(
        type = 'log'
    )
)
fig.show()

In [53]:
class_path = 'rdt.transformers.address.RandomLocationGenerator'
module_path, _, class_name = class_path.rpartition('.')

In [54]:
module_path

'rdt.transformers.address'

In [55]:
class_name

'RandomLocationGenerator'

In [52]:
import rdt 
hasattr(rdt.transformers, 'address')

False

In [1]:
a = None

In [2]:
b = 3
b in a

TypeError: argument of type 'NoneType' is not iterable

In [1]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

data = pd.DataFrame({
    'email': [1, 2, 2],
    'numerical': [0, 1, 2],
})

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)
synthetic_data = synthesizer.sample(len(data))

InvalidDataError: The provided data does not match the metadata:
Key column 'email' contains repeating values: [2]

In [2]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "email": {
            "sdtype": "email",
            "pii": true
        },
        "numerical": {
            "sdtype": "numerical"
        }
    },
    "primary_key": "email"
}

In [4]:
metadata

{
    "primary_key": "email",
    "columns": {
        "email": {
            "sdtype": "email",
            "pii": true
        },
        "numerical": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [1]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

data = pd.DataFrame({
    'email': ['sdv@sdv.dev', 'info@datacebo.com', 'info@gmail.co.uk', None],
    'numerical': [0, 1, 2, 1],
})

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'email': {'sdtype': 'id', 'pii': True},
        'numerical': {'sdtype': 'numerical'},
    }
})
#metadata.detect_from_dataframe(data)
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)

InvalidMetadataError: The following errors were found in the metadata:

Invalid values '(pii)' for id column 'email'.

In [3]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "email": {
            "sdtype": "id"
        },
        "numerical": {
            "sdtype": "numerical"
        }
    }
}

In [1]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'country': {'sdtype': 'country_code'},
        'city': {'sdtype': 'city'},
    }
})

metadata.add_column_relationship('address', ['country', 'city'])
synthesizer = GaussianCopulaSynthesizer(metadata)



In [3]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "country": {
            "sdtype": "country_code"
        },
        "city": {
            "sdtype": "city"
        }
    },
    "column_relationships": [
        {
            "type": "address",
            "column_names": [
                "country",
                "city"
            ]
        }
    ]
}

In [2]:
synthesizer = GaussianCopulaSynthesizer(metadata)



In [1]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'country': {'sdtype': 'country_code'},
        'city': {'sdtype': 'city'},
    }
})
metadata.add_column_relationship('address', ['country', 'city'])

In [2]:
synthesizer = GaussianCopulaSynthesizer(metadata)



In [1]:
from sdv.multi_table import HMASynthesizer
from sdv.datasets.demo import download_demo

mt_data, mt_metadata = download_demo('multi_table', 'fake_hotels')
mt_metadata.add_column('hotels', 'lat', sdtype='latitude')
mt_metadata.add_column('hotels', 'lon', sdtype='longitude')

mt_metadata.add_column_relationship('hotels', 'gps', ['lat', 'lon'])
mt_synth = HMASynthesizer(mt_metadata)



In [7]:
mt_metadata.validate()



In [4]:
get_available_demos('single_table')

Unnamed: 0,dataset_name,size_MB,num_tables
0,KRK_v1,0.07,1
1,adult,3.91,1
2,alarm,4.52,1
3,asia,1.28,1
4,census,98.17,1
5,census_extended,4.95,1
6,child,3.2,1
7,covtype,255.65,1
8,credit,68.35,1
9,expedia_hotel_logs,0.2,1


In [5]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.datasets.demo import download_demo, get_available_demos

mt_data, mt_metadata = download_demo('single_table', 'fake_hotel_guests')
mt_metadata.add_column('lat', sdtype='latitude')
mt_metadata.add_column('lon', sdtype='longitude')

mt_metadata.add_column_relationship('gps', ['lat', 'lon'])
mt_synth = GaussianCopulaSynthesizer(mt_metadata)



## try version

In [2]:
import sdv

print(sdv.version.enterprise)

None


In [3]:
sdv.version.__all__

('public', 'enterprise')

In [2]:
print(sdv.version.public)

1.9.1.dev0


In [1]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

data = pd.DataFrame(data={
    'id': ['N', 'A', 'K', 'F', 'P'],
    'numerical': [1, 2, 3, 2, 1],
    'name': ['A', 'A', 'B', 'B', 'B']
})

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [None]:
tests/integration/single_table/test_copulas.py

In [None]:
tests/unit/data_processing/test_data_processor.py

In [2]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "id": {
            "sdtype": "categorical"
        },
        "numerical": {
            "sdtype": "numerical"
        },
        "name": {
            "sdtype": "categorical"
        }
    }
}

In [1]:


# Run
metadata.update_column(column_name='id', sdtype='first_name')
metadata.update_column(column_name='name', sdtype='name')
metadata.set_primary_key('id')
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.auto_assign_transformers(data)

# Assert
id_transformer = synthesizer.get_transformers()['id']
name_transformer = synthesizer.get_transformers()['name']
assert id_transformer.provider_name == 'person'

In [3]:
synthesizer._data_processor._hyper_transformer.get_config()

{
    "sdtypes": {
        "numerical": "numerical",
        "name": "categorical",
        "id": "categorical"
    },
    "transformers": {
        "numerical": FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
        "name": UniformEncoder(),
        "id": UniformEncoder()
    }
}

In [4]:
metadata

{
    "columns": {
        "id": {
            "sdtype": "first_name"
        },
        "numerical": {
            "sdtype": "numerical"
        },
        "name": {
            "sdtype": "name"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "primary_key": "id"
}

In [2]:
synthesizer.get_transformers()

{'id': UniformEncoder(),
 'numerical': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'name': UniformEncoder()}

In [None]:
def _transformer_and_sdtype_assignment_id(self, data, column, column_metadata):

        is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
        if column_metadata.get('regex_format', False):
            transformer = self.create_regex_generator(
                column,
                'id',
                column_metadata,
                is_numeric
            )
            sdtype = 'text'

        elif column in self._keys:
            prefix = None
            if not is_numeric:
                prefix = 'sdv-id-'

            transformer = IDGenerator(prefix=prefix)
            sdtype = 'text'

        else:
            transformer = AnonymizedFaker(
                provider_name=None,
                function_name='bothify',
                function_kwargs={'text': '#####'}
            )
            sdtype = 'pii'

        return transformer, sdtype

In [2]:
dp._transformers_by_sdtype

{'numerical': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'categorical': UniformEncoder(),
 'boolean': UniformEncoder(),
 'datetime': UnixTimestampEncoder(),
 'pii': AnonymizedFaker(function_name='lexify'),
 None: BaseMultiColumnTransformer(),
 'id': RegexGenerator()}

In [3]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.data_processing import DataProcessor
import numpy as np
from rdt.transformers import FloatFormatter, AnonymizedFaker, UniformEncoder

data = pd.DataFrame({
    'name_pii': ['John', 'Doe', 'Johanna'],
    'phone_pii': ['123-456-7890', '123-456-7890', '123-456-7890'],
    'city_categorical': ['New York', 'Madrid', 'New York'],
    'example_default': [1, 2, 3],
    'example_pii_true': [4, 5, 6],
    'example_pii_false': [7, 8, 9],
    'unknown_pii_true': ['a', 'b', 'c'],
    'unknown_pii_false': ['a', 'b', 'c'],
    'id_pii_true': ['ID_001', 'ID_002', 'ID_003'],
    'id_pii_false': ['ID_001', 'ID_002', 'ID_003'],
})
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'name_pii': {'sdtype': 'name'},
        'phone_pii': {'sdtype': 'phone_number', 'pii': True},
        'city_categorical': {'sdtype': 'city', 'pii': False},
        'example_default': {'sdtype': 'example'},
        'example_pii_true': {'sdtype': 'example', 'pii': True},
        'example_pii_false': {'sdtype': 'example', 'pii': False},
        'unknown_pii_true': {'sdtype': 'unknown', 'pii': True},
        'unknown_pii_false': {'sdtype': 'unknown', 'pii': False},
        'id_pii_true': {'sdtype': 'id', 'pii': True},
        'id_pii_false': {'sdtype': 'id', 'pii': False},
    },
})
dp = DataProcessor(metadata)
dp._transformers_by_sdtype['example']= FloatFormatter()

# Run
config = dp._create_config(data, set())

# Assert
config['transformers']

{'example_default': FloatFormatter(),
 'unknown_pii_true': AnonymizedFaker(function_name='bothify', function_kwargs={'text': 'sdv-pii-?????', 'letters': '0123456789abcdefghijklmnopqrstuvwxyz'}),
 'phone_pii': AnonymizedFaker(provider_name='phone_number', function_name='phone_number'),
 'name_pii': AnonymizedFaker(provider_name='person', function_name='name'),
 'id_pii_true': AnonymizedFaker(function_name='bothify', function_kwargs={'text': '#####'}),
 'example_pii_false': FloatFormatter(),
 'unknown_pii_false': AnonymizedFaker(function_name='bothify', function_kwargs={'text': 'sdv-pii-?????', 'letters': '0123456789abcdefghijklmnopqrstuvwxyz'}),
 'id_pii_false': AnonymizedFaker(function_name='bothify', function_kwargs={'text': '#####'}),
 'example_pii_true': FloatFormatter(),
 'city_categorical': UniformEncoder()}

In [None]:
assert config['sdtypes'] == {
    'name': 'pii',
    'city_column': 'categorical',
    'phone': 'pii',
    'example_column': 'example',
    'example_column_2': 'example',
    'example_column_3': 'example',
}
expected_transformers = {
    'name': AnonymizedFaker,
    'city_column': UniformEncoder,
    'phone': AnonymizedFaker,
    'example_column': FloatFormatter,
    'example_column_2': FloatFormatter,
    'example_column_3': FloatFormatter,
}
for column, transformer in config['transformers'].items():
    assert isinstance(transformer, expected_transformers[column])

In [4]:
config['transformers']

{'example_column_2': FloatFormatter(),
 'phone': AnonymizedFaker(provider_name='phone_number', function_name='phone_number'),
 'name': AnonymizedFaker(provider_name='person', function_name='name'),
 'city_column': UniformEncoder(),
 'example_column': FloatFormatter()}

In [2]:
config['sdtypes']

{'example_column_2': 'example',
 'phone': 'pii',
 'name': 'pii',
 'city_column': 'pii',
 'example_column': 'example'}

In [3]:
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'A': {'sdtype': 'numerical'},
        'B': {'sdtype': 'numerical'},
        'C': {'sdtype': 'numerical'},
        'D': {'sdtype': 'categorical'},
    }
}).to_dict()

metadata

{'columns': {'A': {'sdtype': 'numerical'},
  'B': {'sdtype': 'numerical'},
  'C': {'sdtype': 'numerical'},
  'D': {'sdtype': 'categorical'}},
 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}

In [5]:
from sdv.datasets.demo import download_demo, get_available_demos

get_available_demos('multi_table')

Unnamed: 0,dataset_name,size_MB,num_tables
0,Accidents_v1,296.2,3
1,Atherosclerosis_v1,7.92,4
2,AustralianFootball_v1,32.53,4
3,Biodegradability_v1,0.69,5
4,Bupa_v1,0.06,9
5,CORA_v1,1.99,3
6,Carcinogenesis_v1,1.64,6
7,Chess_v1,0.4,2
8,Countries_v1,10.52,4
9,DCG_v1,0.32,2


In [12]:
data, metadata = download_demo('multi_table', 'genes_v1')

In [11]:
metadata.save_to_json('metadata_test.json')

In [None]:
{
    "synthesizer_name": "DayZSynthesizer",
    "synthesizer_parameters": {"locales": ["en_US"]}
    "metadata": {
        "columns": {
            "col 1": {"sdtype": "numerical"},
            "col 2": {"sdtype": "numerical"},
            "col 3": {"sdtype": "categorical"},
        }
    }
}

In [1]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.data_processing import DataProcessor
import numpy as np
from rdt.transformers import FloatFormatter, AnonymizedFaker, UniformEncoder
from sdv.single_table import GaussianCopulaSynthesizer
import warnings
warnings.simplefilter('always')

metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'A': {'sdtype': 'numerical'},
        'B': {'sdtype': 'numerical'},
        'C': {'sdtype': 'numerical'},
        'D': {'sdtype': 'categorical'},
    }
})
data = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
    'D': ['a', 'b', 'c'],
})
metadata.update_column('A', sdtype='categorical')
synthesizer = GaussianCopulaSynthesizer(metadata)
metadata.update_column('B', sdtype='categorical')
synthesizer.fit(data)




In [2]:
metadata.update_column('B', sdtype='categorical')
synthesizer.fit(data)

In [3]:
synthesizer.metadata._updated = True
synthesizer._check_metadata_updated()
synthesizer._check_metadata_updated()
synthesizer.metadata._updated = True
synthesizer._check_metadata_updated()



In [9]:
metadata.update_column('A', sdtype='categorical')


In [10]:
synthesizer._data_processor.metadata

{
    "columns": {
        "A": {
            "sdtype": "categorical"
        },
        "B": {
            "sdtype": "numerical"
        },
        "C": {
            "sdtype": "numerical"
        },
        "D": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [15]:
metadata

{
    "tables": {
        "Classification": {
            "columns": {
                "GeneID": {
                    "sdtype": "id",
                    "regex_format": "[A-Za-z]{5}"
                },
                "nb_rows_in_Interactions": {
                    "sdtype": "numerical",
                    "computer_representation": "Float"
                },
                "sum(Expression_Corr)": {
                    "sdtype": "numerical",
                    "computer_representation": "Float"
                },
                "nb_rows_in_Genes": {
                    "sdtype": "numerical",
                    "computer_representation": "Int64"
                },
                "max(Expression_Corr)": {
                    "sdtype": "numerical",
                    "computer_representation": "Float"
                },
                "Localization": {
                    "sdtype": "categorical"
                },
                "min(Expression_Corr)": {
                    "s

In [1]:
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
from sdv.datasets.demo import download_demo, get_available_demos
import warnings
warnings.simplefilter('always')

data, metadata = download_demo('multi_table', 'genes_v1')
#metadata = MultiTableMetadata()
#metadata.detect_from_dataframes(data) # this logic is not guaranteed to be accurate and may change!!

metadata.update_column(table_name="Classification", column_name="nb_rows_in_Genes", sdtype="categorical")
metadata.update_column(table_name="Genes", column_name="add_numerical", sdtype="categorical")
synthesizer = HMASynthesizer(metadata)

#synthesizer.fit(data)



In [2]:
data, metadata = download_demo('multi_table', 'got_families')

In [3]:
metadata

{
    "tables": {
        "characters": {
            "columns": {
                "character_id": {
                    "sdtype": "id",
                    "regex_format": "^[1-9]{1,2}$"
                },
                "name": {
                    "sdtype": "categorical"
                },
                "age": {
                    "sdtype": "numerical",
                    "computer_representation": "Int64"
                }
            },
            "primary_key": "character_id"
        },
        "families": {
            "columns": {
                "family_id": {
                    "sdtype": "id",
                    "regex_format": "^[1-9]$"
                },
                "name": {
                    "sdtype": "categorical"
                }
            },
            "primary_key": "family_id"
        },
        "character_families": {
            "columns": {
                "character_id": {
                    "sdtype": "id",
                    "regex_format": 

In [3]:
synthesizer = HMASynthesizer(metadata)
synthesizer.add_constraints()

TypeError: add_constraints() missing 1 required positional argument: 'constraints'

In [1]:
from tests.utils import get_multi_table_data, get_multi_table_metadata
from sdv.multi_table.base import BaseMultiTableSynthesizer
import warnings
warnings.simplefilter('always')

metadata = get_multi_table_metadata()
metadata.add_column('nesreca', 'lat', sdtype='latitude')
metadata.add_column('nesreca', 'lon', sdtype='longitude')

metadata.add_column_relationship('nesreca', 'gps', ['lat', 'lon'])

# Run
expected_warning = (
    "The metadata contains a column relationship of type 'gps'. "
    'which requires the gps add-on. This relationship will be ignored. For higher'
    ' quality data in this relationship, please inquire about the SDV Enterprise tier.'
)
BaseMultiTableSynthesizer(metadata)




<sdv.multi_table.base.BaseMultiTableSynthesizer at 0x12feb1f40>

In [1]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.data_processing import DataProcessor
from rdt.transformers import FloatFormatter

data = pd.DataFrame({
    'name_pii': ['John', 'Doe', 'Johanna'],
    'phone_pii': ['123-456-7890', '123-456-7890', '123-456-7890'],
    'city_categorical': ['New York', 'Madrid', 'New York'],
    'example_default': [1, 2, 3],
    'example_pii_true': [4, 5, 6],
    'example_pii_false': [7, 8, 9],
    'unknown_pii_true': ['a', 'b', 'c'],
    'unknown_pii_false': ['a', 'b', 'c'],
    'id_pii_true': ['ID_001', 'ID_002', 'ID_003'],
    'id_pii_false': ['ID_001', 'ID_002', 'ID_003'],
})
metadata = SingleTableMetadata().load_from_dict({
    'columns': {
        'name_pii': {'sdtype': 'name'},
        'phone_pii': {'sdtype': 'phone_number', 'pii': True},
        'city_categorical': {'sdtype': 'city', 'pii': False},
        'example_default': {'sdtype': 'example'},
        'example_pii_true': {'sdtype': 'example', 'pii': True},
        'example_pii_false': {'sdtype': 'example', 'pii': False},
        'unknown_pii_true': {'sdtype': 'unknown', 'pii': True},
        'unknown_pii_false': {'sdtype': 'unknown', 'pii': False},
        'id_pii_true': {'sdtype': 'id', 'pii': True},
        'id_pii_false': {'sdtype': 'id', 'pii': False},
    },
})
dp = DataProcessor(metadata)
dp._transformers_by_sdtype['example'] = FloatFormatter()

# Run
config = dp._create_config(data, set())

In [2]:
config['transformers']

{'example_pii_true': FloatFormatter(),
 'city_categorical': UniformEncoder(),
 'unknown_pii_false': AnonymizedFaker(function_name='bothify', function_kwargs={'text': 'sdv-pii-?????', 'letters': '0123456789abcdefghijklmnopqrstuvwxyz'}),
 'id_pii_true': AnonymizedFaker(function_name='bothify', function_kwargs={'text': '#####'}),
 'example_default': FloatFormatter(),
 'unknown_pii_true': AnonymizedFaker(function_name='bothify', function_kwargs={'text': 'sdv-pii-?????', 'letters': '0123456789abcdefghijklmnopqrstuvwxyz'}),
 'id_pii_false': AnonymizedFaker(function_name='bothify', function_kwargs={'text': '#####'}),
 'name_pii': AnonymizedFaker(provider_name='person', function_name='name'),
 'phone_pii': AnonymizedFaker(provider_name='phone_number', function_name='phone_number'),
 'example_pii_false': FloatFormatter()}

In [None]:
expected_functions = {
    'unknown_pii_false': 'bothify',
    'unknown_pii_true': 'bothify',
    'phone_pii': 'phone_number',
    'name_pii': 'person',
    'id_pii_true': 'bothify',
    'id_pii_false': 'bothify'
}

In [5]:
config['transformers']['unknown_pii_false'].function_name

'bothify'

In [None]:
from sdv._utils import drop_unknown_references
from sdv.multi_table import HMASynthesizer

cleaned_data = drop_unknown_references(metadata=my_metadata, data=original_data)

synth = HMASynthesizer(metadata)
synth.fit(cleaned_data) # now synthesizers should accept the cleaned data


In [5]:
from sdv.datasets.demo import download_demo, get_available_demos

get_available_demos('multi_table')

Unnamed: 0,dataset_name,size_MB,num_tables
0,Accidents_v1,296.2,3
1,Atherosclerosis_v1,7.92,4
2,AustralianFootball_v1,32.53,4
3,Biodegradability_v1,0.69,5
4,Bupa_v1,0.06,9
5,CORA_v1,1.99,3
6,Carcinogenesis_v1,1.64,6
7,Chess_v1,0.4,2
8,Countries_v1,10.52,4
9,DCG_v1,0.32,2


In [6]:
data, metadata = download_demo('multi_table', 'genes_v1')

In [10]:
relationship = metadata.relationships[0]

In [11]:
relationship

{'parent_table_name': 'Classification',
 'parent_primary_key': 'GeneID',
 'child_table_name': 'Genes',
 'child_foreign_key': 'GeneID'}

In [24]:
table_to_idx_to_drop = {}

In [None]:
~

In [22]:
idx_to_drop = set(data[relationship['child_table_name']][relationship['child_foreign_key']].isin(data[relationship['parent_table_name']][relationship['parent_primary_key']]).index)

In [None]:
table_to_idx_to_drop

In [1]:
import pandas as pd
from sdv.metadata import MultiTableMetadata
from sdv._utils import drop_unknown_references


parent = pd.DataFrame(data={
    'id': [0, 1, 2, 3, 4],
    'A': [True, True, False, True, False],
    'B': [0.434, 0.312, 0.212, 0.339, 0.491]
})

child = pd.DataFrame(data={
    'parent_id': [0, 1, 2, 2, 5],
    'C': ['Yes', 'No', 'Maye', 'No', 'No']
})

data = {
    'parent': parent,
    'child': child
}

metadata = MultiTableMetadata.load_from_dict({
    'tables': {
        'parent': {
            'columns': {
              'id': { 'sdtype': 'id' },
              'A': { 'sdtype': 'categorical'},
              'B': { 'sdtype': 'numerical'}
            },
            'primary_key': 'id'
        },
        'child': {
            'columns': {
                'parent_id': { 'sdtype': 'id' },
                'C': { 'sdtype': 'categorical'}
            }
        }
    },
    'relationships': [{
        'parent_table_name': 'parent',
        'child_table_name': 'child',
        'parent_primary_key': 'id',
        'child_foreign_key': 'parent_id'
    }]
})

metadata.validate()

data_2 = drop_unknown_references(metadata, data)
metadata.validate_data(data_2)

In [2]:
data_2

{'parent':    id      A      B
 0   0   True  0.434
 1   1   True  0.312
 2   2  False  0.212
 3   3   True  0.339
 4   4  False  0.491,
 'child':    parent_id     C
 0          0   Yes
 1          1    No
 2          2  Maye
 3          2    No}

In [3]:
metadata.validate_data(data)

InvalidDataError: The provided data does not match the metadata:
Relationships:
Error: foreign key column 'parent_id' contains unknown references: (5). All the values in this column must reference a primary key.

In [4]:
from sdv.multi_table import HMASynthesizer

synth = HMASynthesizer(metadata)
synth.fit(data)

InvalidDataError: The provided data does not match the metadata:
Relationships:
Error: foreign key column 'parent_id' contains unknown references: (5). All the values in this column must reference a primary key.

In [None]:
def drop_unknown_references(metadata, data, drop_missing_values=True):
    """Drop rows with unknown foreign keys.

    Args:
        metadata (MultiTableMetadata):
            Metadata of the datasets.
        data (dict):
            Dictionary that maps each table name (string) to the data for that
            table (pandas.DataFrame).
        drop_missing_values (bool):
            Boolean describing whether or not to also drop foreign keys with missing values
            If True, drop rows with missing values in the foreign keys.
            Defaults to True.

    Returns:
        dict:
            Dictionary with the dataframes ensurint referential integrity.
    """
    result = data.copy()
    table_to_idx_to_drop = {}
    relationships = deepcopy(metadata.relationships)
    while relationships:
        current_roots = _find_root_tables(relationships)
        for root in current_roots:
            relationship_idx = _get_relationship_idx_for_parent(relationships, root)
            for idx in relationship_idx:
                relationship = relationships[idx]
                parent_table = relationship['parent_table_name']
                child_table = relationship['child_table_name']
                parent_column = relationship['parent_primary_key']
                child_column = relationship['child_foreign_key']
                if child_table not in table_to_idx_to_drop:
                    table_to_idx_to_drop[child_table] = set()

                is_nan = result[child_table][child_column].isna()
                invalid_values = set(result[child_table].loc[~is_nan, child_column]) - set(
                    result[parent_table][parent_column]
                )
                invalid_rows = result[child_table][result[child_table][child_column].isin(invalid_values)]
                idx_to_drop = set(invalid_rows.index)

                if idx_to_drop:
                    table_to_idx_to_drop[child_table] = table_to_idx_to_drop[
                        child_table
                    ].union(idx_to_drop)

        relationships = deepcopy(_remove_processed_relationships(current_roots, relationships))

    for table, idx_to_drop in table_to_idx_to_drop.items():
        result[table] = result[table].drop(idx_to_drop)
        if drop_missing_values:
            result[table] = result[table].dropna(subset=[child_column])

    return result
