In [1]:
from sdv.datasets.demo import download_demo
from sdv.lite import SingleTablePreset
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import evaluate_quality, get_column_plot, get_column_pair_plot

In [2]:
# Fetch the demo hotel-guest table together with the metadata object that describes it.
real_data, metadata = download_demo(modality='single_table', dataset_name='fake_hotel_guests')

# Data & Metadata Formats

In [3]:
# Preview the first rows of the real table (the metadata is displayed in the next cell)
real_data.head()

## We can pass any pandas DataFrame to SDV (as long as no column is entirely empty)

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,27 Dec 2020,29 Dec 2020,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,30 Dec 2020,02 Jan 2021,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,28 Dec 2020,31 Dec 2020,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,05 Apr 2020,,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


In [4]:
# Display the metadata object (primary key plus an sdtype entry per column)
metadata

# metadata is essentially a JSON object (or Python dict) containing column names and SDTypes. The available SDTypes are listed here: https://docs.sdv.dev/sdv/reference/metadata-spec/sdtypes
# We can use Faker types as SDTypes
# SDV gives us a couple additional utility methods and attributes for metadata objects

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "primary_key": "guest_email",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }


# Creating a Synthesizer

In [5]:
## These imports are not used below; they are listed only to show which synthesizers are available

# from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
# from sdv.multi_table import HMASynthesizer
# from sdv.sequential import PARSynthesizer

In [6]:
# Build and fit a GaussianCopulaSynthesizer, which uses classic, statistical methods
# to train a model and generate synthetic data.
# The commented-out keyword below is left in place on purpose: a later cell asks the
# reader to come back, uncomment it, and refit.
synthesizer = GaussianCopulaSynthesizer(
    metadata,
    # numerical_distributions={"amenities_fee":"norm"}
)
synthesizer.fit(data=real_data)

# Generating Synthetic Data

In [7]:
# Draw 500 synthetic rows from the fitted synthesizer
synthetic_data = synthesizer.sample(num_rows=500)

In [8]:
# Preview the first synthetic rows
synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dsullivan@example.net,True,BASIC,2.1,26 Mar 2020,12 Apr 2020,122.83,"90469 Karla Knolls Apt. 781\nSusanberg, NC 28401",5161033759518983
1,steven59@example.org,False,DELUXE,,03 Jul 2020,15 Sep 2020,175.3,"1080 Ashley Creek Apt. 622\nWest Amy, NM 25058",4133047413145475690
2,brandon15@example.net,False,DELUXE,22.38,30 Mar 2020,17 Mar 2020,151.45,"99923 Anderson Trace Suite 861\nNorth Haley, T...",4977328103788
3,humphreyjennifer@example.net,False,BASIC,8.44,05 May 2020,22 May 2020,169.82,"9301 John Parkways\nThomasland, OH 61350",3524946844839485
4,joshuabrown@example.net,False,SUITE,8.21,13 Jan 2020,10 Jan 2020,189.85,"126 George Tunnel\nDuranstad, MS 95176",4446905799576890978


In [9]:
# Let's see if it included any "real" emails (from our original data) in our synthetic output.
# NOTE: `value in series` tests the Series *index* (row labels), not its values, so a plain
# `email in synthetic_data["guest_email"]` would report no matches even if emails had leaked.
# `.isin()` compares against the actual values instead.
real_data.loc[
    real_data["guest_email"].isin(synthetic_data["guest_email"]),
    "guest_email",
].tolist()

## It doesn't include any of the original emails because we marked it as PII in the metadata object

[]

# Quality Evaluation 

In [12]:
# Plot real vs. synthetic distributions column by column.
# PII columns cannot be plotted since they are unique and meant to be different, so a
# failure for one column is reported and the loop carries on with the next one.
for column in real_data.columns:
    try:
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            column_name=column,
            metadata=metadata
        )
        fig.show()
    except Exception as error:
        # Catch `Exception` rather than using a bare `except:` so that KeyboardInterrupt
        # still stops the loop, and say which column was skipped and why.
        print(f"Skipping column {column!r}: {error}")

# Note the missing values that were included by the synthesizer

In [13]:
# Median amenities fee in the real table vs. the synthetic table, as a (real, synthetic) pair
tuple(table["amenities_fee"].median() for table in (real_data, synthetic_data))

(18.59, 9.67)

In [14]:
## Go back and reinstantiate GaussianCopulaSynthesizer with "numerical_distributions" argument line uncommented

### Quality Report

In [30]:
# Score how closely the synthetic table matches the real one.
# NOTE(review): in the saved run this cell executed as In [30], i.e. after the
# conditional-sampling cells that rebind `synthetic_data`; the scores shown may
# therefore describe the 5-row conditional sample rather than the 500-row one
# (the exact ±1.0 synthetic correlations point that way). Re-run top to bottom to confirm.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [00:00<00:00, 31.15it/s]



Overall Quality Score: 50.35%

Properties:
Column Shapes: 53.58%
Column Pair Trends: 47.12%


In [31]:
# List the metric classes the report uses for each property
quality_report.METRICS

{'Column Shapes': [sdmetrics.single_table.multi_single_column.KSComplement,
  sdmetrics.single_table.multi_single_column.TVComplement],
 'Column Pair Trends': [sdmetrics.single_table.multi_column_pairs.CorrelationSimilarity,
  sdmetrics.single_table.multi_column_pairs.ContingencySimilarity]}

In [36]:
# Per-column scores behind the 'Column Shapes' property
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,amenities_fee,KSComplement,0.226374
1,checkin_date,KSComplement,0.522
2,checkout_date,KSComplement,0.504167
3,room_rate,KSComplement,0.824
4,has_rewards,TVComplement,0.506
5,room_type,TVComplement,0.632


In [None]:
# Visualize the 'Column Shapes' property.
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook.
quality_report.get_visualization(property_name='Column Shapes')

In [37]:
quality_report.get_details(property_name='Column Pair Trends')
# The Metric column shows which metric scored each pair: CorrelationSimilarity for the
# numerical/datetime pairs, and ContingencySimilarity for pairs that involve has_rewards
# or room_type (those rows have no correlation values).

Unnamed: 0,Column 1,Column 2,Metric,Quality Score,Real Correlation,Synthetic Correlation
0,amenities_fee,checkin_date,CorrelationSimilarity,0.515705,0.03141,1.0
1,amenities_fee,checkout_date,CorrelationSimilarity,0.511524,0.023049,1.0
2,amenities_fee,room_rate,CorrelationSimilarity,0.495574,0.008852,-1.0
3,checkin_date,checkout_date,CorrelationSimilarity,0.977441,0.999953,0.954834
4,checkin_date,room_rate,CorrelationSimilarity,0.558993,0.01782,-0.864195
5,checkout_date,room_rate,CorrelationSimilarity,0.627076,0.008203,-0.737646
6,has_rewards,room_type,ContingencySimilarity,0.468,,
7,amenities_fee,has_rewards,ContingencySimilarity,0.272,,
8,checkin_date,has_rewards,ContingencySimilarity,0.224,,
9,checkout_date,has_rewards,ContingencySimilarity,0.506,,


### For more evaluation metrics, see https://docs.sdv.dev/sdmetrics/metrics/metrics-glossary

# Conditional Sampling

### Fixed Conditions

In [15]:
from sdv.sampling import Condition

# Two fixed scenarios of 250 rows each: SUITE guests with rewards, then SUITE guests without.
suite_guests_with_rewards, suite_guests_without_rewards = (
    Condition(
        num_rows=250,
        column_values={'room_type': 'SUITE', 'has_rewards': rewards_flag},
    )
    for rewards_flag in (True, False)
)

In [17]:
# Sample rows that satisfy the two fixed conditions (250 rows each).
# NOTE: this rebinds `synthetic_data`, replacing the 500-row unconditional sample
# created earlier in the notebook.
synthetic_data = synthesizer.sample_from_conditions(
    [suite_guests_with_rewards, suite_guests_without_rewards],
    output_file_path='synthetic_simulated_scenario.csv',
)

Sampling conditions: 100%|██████████| 500/500 [00:00<00:00, 1224.87it/s]


In [18]:
# Row/column count of the conditionally sampled table (two conditions of 250 rows each)
synthetic_data.shape

(500, 9)

In [19]:
# Confirm the requested split between guests with and without rewards
synthetic_data["has_rewards"].value_counts()

has_rewards
True     250
False    250
Name: count, dtype: int64

### Condition on Known Column Values

In [20]:
import pandas as pd

In [21]:
# Known (room_type, has_rewards) values, one tuple per row; the synthesizer fills in the rest.
existing_column_data = pd.DataFrame(
    [
        ('SUITE', True),
        ('SUITE', True),
        ('DELUXE', True),
        ('BASIC', False),
        ('BASIC', False),
    ],
    columns=['room_type', 'has_rewards'],
)

In [22]:
# Preview the known column values that will be passed to the synthesizer
existing_column_data.head()

Unnamed: 0,room_type,has_rewards
0,SUITE,True
1,SUITE,True
2,DELUXE,True
3,BASIC,False
4,BASIC,False


In [23]:
# Keep the known column values and let the synthesizer generate every other column.
# NOTE: this rebinds `synthetic_data` again, now to a table with one row per known row.
synthetic_data = synthesizer.sample_remaining_columns(
    existing_column_data,
    max_tries_per_batch=500,
)

Sampling remaining columns: 100%|██████████| 5/5 [00:00<00:00, 24.52it/s]


In [24]:
# Show the completed rows (room_type and has_rewards come from existing_column_data)
synthetic_data

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,christophermiller@example.com,True,SUITE,0.0,03 Feb 2020,28 Feb 2020,255.17,"6897 Joseph Meadow Suite 514\nVillarrealberg, ...",30343480880655
1,dgarcia@example.org,True,SUITE,,07 May 2020,20 May 2020,159.66,"930 Matthew Union Suite 195\nWest Cynthia, NM ...",4930915359735
2,iwhite@example.org,True,DELUXE,,24 Jun 2020,11 Jul 2020,109.88,"96602 Carl Spur Apt. 379\nCatherineberg, NM 41348",6573028438398211
3,tsanchez@example.com,False,BASIC,9.62,13 May 2020,19 Apr 2020,112.56,"24570 Wilson Walks\nWest Megan, WY 54869",3582077138450885
4,bellshawn@example.com,False,BASIC,,22 Jul 2020,20 Jul 2020,134.78,Unit 2094 Box 3077\nDPO AE 02522,4142271383722418


# Saving & Loading

In [25]:
# Persist the fitted synthesizer to disk so it can be reloaded later.
save_path = "basic_synthesizer.pkl"
synthesizer.save(filepath=save_path)

In [26]:
# Reload the synthesizer through the same class that created and saved it
# (GaussianCopulaSynthesizer), rather than the unrelated SingleTablePreset class.
# The file being read is the one written by the save cell above.
loaded_synthesizer = GaussianCopulaSynthesizer.load(filepath=save_path)

In [27]:
# Sanity check: the reloaded synthesizer can still generate rows
loaded_synthesizer.sample(2)

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,zrivera@example.net,False,DELUXE,7.12,17 Jun 2020,18 May 2020,167.53,"4421 Terry Inlet\nSouth Leeborough, ND 01324",4939771093271108
1,david46@example.com,False,BASIC,34.26,18 Oct 2020,,125.53,"7166 Blackburn Extension\nSouth Dennisberg, PA...",3515171266282087
