In [1]:
from sdv.datasets.demo import download_demo
from sdv.lite import SingleTablePreset
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import evaluate_quality, get_column_plot, get_column_pair_plot

In [2]:
# Fetch the 'fake_hotel_guests' single-table demo: the table itself plus its metadata
real_data, metadata = download_demo(modality='single_table', dataset_name='fake_hotel_guests')

# Data & Metadata Formats

In [3]:
# Preview the first rows of real_data (the metadata is shown in the next cell)
real_data.head()

# Any pandas DataFrame can be passed to SDV, as long as no column is entirely empty

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,27 Dec 2020,29 Dec 2020,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,30 Dec 2020,02 Jan 2021,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,28 Dec 2020,31 Dec 2020,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,05 Apr 2020,,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


In [4]:
metadata

# The metadata is essentially a JSON object (or Python dict) that maps column names to SDTypes.
# SDTypes reference: https://docs.sdv.dev/sdv/reference/metadata-spec/sdtypes
# Faker types can also be used as SDTypes.
# SDV provides a couple of additional utility methods and attributes on metadata objects.

{
    "primary_key": "guest_email",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABL

# Creating a Synthesizer

In [5]:
## These imports are not used below; they are listed only to show the available synthesizers

# from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
# from sdv.multi_table import HMASynthesizer
# from sdv.sequential import PARSynthesizer

In [11]:
# Create and train a synthesizer. GaussianCopulaSynthesizer uses classic, statistical
# methods to learn a model of the real data and generate synthetic data from it.
# `numerical_distributions` requests the "norm" distribution for the amenities_fee column.
synthesizer = GaussianCopulaSynthesizer(
    metadata,
    numerical_distributions={"amenities_fee": "norm"},
)
synthesizer.fit(data=real_data)

# Generating Synthetic Data

In [12]:
# Draw 500 synthetic rows from the fitted synthesizer
synthetic_data = synthesizer.sample(num_rows=500)

In [13]:
# Preview the first rows of the synthetic data
synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dsullivan@example.net,False,DELUXE,9.94,12 Mar 2020,30 Mar 2020,114.22,"90469 Karla Knolls Apt. 781\nSusanberg, NC 28401",5161033759518983
1,steven59@example.org,False,DELUXE,,21 Jun 2020,04 Sep 2020,176.24,"1080 Ashley Creek Apt. 622\nWest Amy, NM 25058",4133047413145475690
2,brandon15@example.net,False,BASIC,23.12,31 Mar 2020,20 Mar 2020,125.61,"99923 Anderson Trace Suite 861\nNorth Haley, T...",4977328103788
3,humphreyjennifer@example.net,False,BASIC,23.4,13 May 2020,27 May 2020,200.98,"9301 John Parkways\nThomasland, OH 61350",3524946844839485
4,joshuabrown@example.net,False,DELUXE,20.29,11 Jan 2020,09 Jan 2020,178.67,"126 George Tunnel\nDuranstad, MS 95176",4446905799576890978


In [14]:
# Check whether any "real" email (from the original data) also appears in the synthetic output.
# NOTE: `value in series` tests the Series *index labels* (0, 1, 2, ...), not its values, so
# `email in synthetic_data["guest_email"]` can never match an email string. `isin` compares
# against the actual values, which is what this check needs.
real_data.loc[real_data["guest_email"].isin(synthetic_data["guest_email"]), "guest_email"].tolist()

## Expected to be empty: guest_email is marked as PII in the metadata object, so the synthesizer
## generates new emails rather than reusing the original ones

[]

# Quality Evaluation 

In [15]:
# Plot the real vs. synthetic distribution of every column.
# PII columns cannot be plotted since their values are unique and meant to be different,
# so a column that fails is skipped. The failure is reported instead of being silently
# swallowed, so unexpected errors (typos, bad metadata, ...) stay visible.
for column in real_data.columns:
    try:
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            column_name=column,
            metadata=metadata,
        )
        fig.show()
    except Exception as exc:  # unlike a bare `except:`, this lets KeyboardInterrupt through
        print(f"Skipping plot for column {column!r}: {type(exc).__name__}: {exc}")

# Note the missing values that were included by the synthesizer

In [16]:
# Compare the median amenities_fee of the real data (first) and the synthetic data (second)
real_data["amenities_fee"].median(), synthetic_data["amenities_fee"].median()

(18.59, 18.07)

In [17]:
## Go back and re-instantiate GaussianCopulaSynthesizer with the `numerical_distributions` argument uncommented, then re-run the cells from there

### Quality Report

In [18]:
# Build a quality report that scores the synthetic data against the real data
quality_report = evaluate_quality(real_data, synthetic_data, metadata)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 9/9 [00:00<00:00, 1830.60it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 36/36 [00:00<00:00, 295.30it/s]

Overall Quality Score: 92.45%

Properties:
- Column Shapes: 93.13%
- Column Pair Trends: 91.77%


In [21]:
# Score of each property in the quality report, as a table
quality_report.get_properties()

Unnamed: 0,Property,Score
0,Column Shapes,0.931287
1,Column Pair Trends,0.917693


In [22]:
# Per-column metric and score behind the 'Column Shapes' property
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,has_rewards,TVComplement,0.992
1,room_type,TVComplement,0.97
2,amenities_fee,KSComplement,0.926807
3,checkin_date,KSComplement,0.964
4,checkout_date,KSComplement,0.972917
5,room_rate,KSComplement,0.762


In [23]:
# Visualize the 'Column Shapes' property of the report
quality_report.get_visualization(property_name='Column Shapes')

In [24]:
quality_report.get_details(property_name='Column Pair Trends')
# In the rows shown below, every pair that involves has_rewards or room_type is scored with
# ContingencySimilarity, while the amenities_fee / checkin_date pair is scored with CorrelationSimilarity

Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation
0,has_rewards,room_type,ContingencySimilarity,0.962,,
1,has_rewards,amenities_fee,ContingencySimilarity,0.81,,
2,has_rewards,checkin_date,ContingencySimilarity,0.914,,
3,has_rewards,checkout_date,ContingencySimilarity,0.918,,
4,has_rewards,room_rate,ContingencySimilarity,0.876,,
5,room_type,amenities_fee,ContingencySimilarity,0.852,,
6,room_type,checkin_date,ContingencySimilarity,0.876,,
7,room_type,checkout_date,ContingencySimilarity,0.866,,
8,room_type,room_rate,ContingencySimilarity,0.752,,
9,amenities_fee,checkin_date,CorrelationSimilarity,0.991508,0.03141,0.048393


### For more evaluation metrics, see - https://docs.sdv.dev/sdmetrics/metrics/metrics-glossary

# Conditional Sampling

### Fixed Conditions

In [25]:
from sdv.sampling import Condition

# Two fixed scenarios of 250 rows each: SUITE guests with and without rewards
suite_guests_with_rewards = Condition(
    column_values={'room_type': 'SUITE', 'has_rewards': True},
    num_rows=250,
)
suite_guests_without_rewards = Condition(
    column_values={'room_type': 'SUITE', 'has_rewards': False},
    num_rows=250,
)

In [27]:
# Sample rows that satisfy both conditions and write the result to a CSV file.
# NOTE: this rebinds `synthetic_data`, replacing the unconditional sample from earlier.
synthetic_data = synthesizer.sample_from_conditions(
    output_file_path='synthetic_simulated_scenario.csv',
    conditions=[
        suite_guests_with_rewards,
        suite_guests_without_rewards,
    ],
)

Sampling conditions: 100%|██████████| 500/500 [00:00<00:00, 2399.34it/s]


In [28]:
# (rows, columns) of the conditionally sampled data
synthetic_data.shape

(500, 9)

In [29]:
# Count the sampled rows for each has_rewards value
synthetic_data["has_rewards"].value_counts()

has_rewards
True     250
False    250
Name: count, dtype: int64

### Condition on Known Column Values

In [30]:
import pandas as pd

In [31]:
# Known values for two columns; the synthesizer will be asked to fill in the remaining ones
existing_column_data = pd.DataFrame(
    {
        'room_type': ['SUITE', 'SUITE', 'DELUXE', 'BASIC', 'BASIC'],
        'has_rewards': [True, True, True, False, False],
    }
)

In [32]:
# Preview the known column values
existing_column_data.head()

Unnamed: 0,room_type,has_rewards
0,SUITE,True
1,SUITE,True
2,DELUXE,True
3,BASIC,False
4,BASIC,False


In [33]:
# Keep the known column values fixed and let the synthesizer generate the other columns.
# NOTE: this rebinds `synthetic_data` again.
synthetic_data = synthesizer.sample_remaining_columns(
    max_tries_per_batch=500,
    known_columns=existing_column_data,
)

Sampling remaining columns: 100%|██████████| 5/5 [00:00<00:00, 46.02it/s]


In [34]:
# Display the full result: one synthetic row per row of existing_column_data
synthetic_data

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,christophermiller@example.com,True,SUITE,2.39,02 Feb 2020,27 Feb 2020,306.56,"6897 Joseph Meadow Suite 514\nVillarrealberg, ...",30343480880655
1,dgarcia@example.org,True,SUITE,,13 May 2020,27 May 2020,110.88,"930 Matthew Union Suite 195\nWest Cynthia, NM ...",4930915359735
2,iwhite@example.org,True,DELUXE,,20 Jun 2020,07 Jul 2020,154.81,"96602 Carl Spur Apt. 379\nCatherineberg, NM 41348",6573028438398211
3,tsanchez@example.com,False,BASIC,16.61,15 May 2020,21 Apr 2020,100.44,"24570 Wilson Walks\nWest Megan, WY 54869",3582077138450885
4,bellshawn@example.com,False,BASIC,,23 Jul 2020,22 Jul 2020,113.52,Unit 2094 Box 3077\nDPO AE 02522,4142271383722418


# Saving & Loading

In [35]:
# Persist the fitted synthesizer to a file in the current working directory
save_path = "basic_synthesizer.pkl"
synthesizer.save(save_path)

In [36]:
# Load with the same class that saved the file: `synthesizer` is a GaussianCopulaSynthesizer,
# not a SingleTablePreset, so use the matching class's `load`.
# NOTE: the saved file is a pickle; only load synthesizer files from a trusted source.
loaded_synthesizer = GaussianCopulaSynthesizer.load(save_path)

In [37]:
# Draw 2 rows from the loaded synthesizer
loaded_synthesizer.sample(2)

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,zrivera@example.net,False,BASIC,17.88,04 Jun 2020,04 May 2020,190.33,"4421 Terry Inlet\nSouth Leeborough, ND 01324",4939771093271108
1,david46@example.com,False,BASIC,29.28,11 Nov 2020,,95.56,"7166 Blackburn Extension\nSouth Dennisberg, PA...",3515171266282087
