## SEE LINK TO UNDERSTAND HOW CUSTOM CONSTRAINTS ARE APPLIED
https://docs.sdv.dev/sdv/reference/constraint-logic/custom-logic

https://1485348715-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FfNxEeZzl9uFiJ4Zf4BRZ%2Fuploads%2F0D8hru94DAfjNURndiFe%2FScreen%20Shot%202022-06-09%20at%2011.53.52%20AM.png?alt=media&token=6ce57aa9-2afb-4631-815d-55936d27e3df


In [1]:
from sdv.datasets.demo import download_demo

In [2]:
# fetch the demo hotel-guests table together with its metadata
real_data, metadata = download_demo(
    dataset_name='fake_hotel_guests',
    modality='single_table',
)

This dataset contains information about various guests staying at a hotel. There is one complex rule: rewards members don't pay an amenities fee. That is, if has_rewards=True, then amenities_fee=0.

In [3]:
# preview the first rows of the real data
real_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,27 Dec 2020,29 Dec 2020,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,30 Dec 2020,02 Jan 2021,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,28 Dec 2020,31 Dec 2020,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,05 Apr 2020,,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


In [4]:
# display the metadata object returned by download_demo
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "primary_key": "guest_email",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    }
}


## Validity Checks

In [21]:
def is_valid(column_names, data):
  """Flag, row by row, whether the "True implies 0" rule holds.

  Args:
    column_names: two column names; the first is taken to be the boolean
      column (has_rewards) and the second the numerical column
      (amenities_fee).
    data: table holding both columns.

  Returns:
    A boolean Series that is True where the row satisfies the rule.
  """
  flag = data[column_names[0]]
  amount = data[column_names[1]]

  # rows where the flag is False are valid whatever the amount is
  flag_off = flag.eq(False)

  # rows where the flag is True are valid only when the amount is exactly 0
  flag_on_and_zero = flag.eq(True) & amount.eq(0.0)

  return flag_on_and_zero | flag_off

The SDV expects that all rows of your real data are valid. That is, calling is_valid on your real data should return a Series of only True values.

## Transformations

The transformations must return the full dataset with specific columns transformed. We can modify, delete or add columns as long as we can reverse the transformation later.

For our function, we'll remove the 0 value whenever the boolean is True. This will allow the machine learning to learn the numerical distribution without these extra 0s.

In [22]:
def transform(column_names, data):
  """Replace the forced 0 values of flagged rows with a typical value.

  Args:
    column_names: two column names; the first is taken to be the boolean
      column (has_rewards) and the second the numerical column
      (amenities_fee).
    data: pandas DataFrame holding both columns.

  Returns:
    A copy of ``data`` in which the numerical column is overwritten with a
    typical value (median) on every row whose boolean column is True. The
    DataFrame passed in is left unmodified.
  """
  boolean_column = column_names[0]
  numerical_column = column_names[1]

  # work on a copy so the caller's DataFrame is not changed as a side effect
  data = data.copy()
  is_flagged = data[boolean_column] == True
  amounts = data[numerical_column]

  # The flagged rows all hold the forced 0, so including them would drag the
  # median toward 0 (and make it exactly 0 once they are the majority).
  # Take the median of the unflagged rows instead, and only fall back to the
  # overall median when no unflagged row has a value.
  unflagged_amounts = amounts[~is_flagged].dropna()
  if unflagged_amounts.empty:
    typical_value = amounts.median()
  else:
    typical_value = unflagged_amounts.median()

  data[numerical_column] = amounts.mask(is_flagged, typical_value)

  return data

def reverse_transform(column_names, data):
  """Restore the rule: set the numerical column to 0 where the flag is True.

  Args:
    column_names: two column names; the first is taken to be the boolean
      column (has_rewards) and the second the numerical column
      (amenities_fee).
    data: pandas DataFrame holding both columns.

  Returns:
    A copy of ``data`` whose numerical column is 0.0 on every row where the
    boolean column is True. The DataFrame passed in is left unmodified.
  """
  boolean_column = column_names[0]
  numerical_column = column_names[1]

  # work on a copy so the caller's DataFrame is not changed as a side effect
  data = data.copy()

  # set the numerical column to 0 if the boolean is True
  is_flagged = data[boolean_column] == True
  data[numerical_column] = data[numerical_column].mask(is_flagged, 0.0)

  return data

Finally, we can create our custom class by supplying these functions to the create_custom_constraint_class factory method. Since our constraint forces the numerical column to 0 whenever the boolean column is True, let's call it IfTrueThenZero.

In [23]:
%%writefile ./custom_constraint_example.py

from sdv.constraints import create_custom_constraint_class

## COPIED FROM ABOVE
def is_valid(column_names, data):
  """Flag, row by row, whether the "True implies 0" rule holds.

  Args:
    column_names: two column names; the first is taken to be the boolean
      column (has_rewards) and the second the numerical column
      (amenities_fee).
    data: table holding both columns.

  Returns:
    A boolean Series that is True where the row satisfies the rule.
  """
  flag = data[column_names[0]]
  amount = data[column_names[1]]

  # rows where the flag is False are valid whatever the amount is
  flag_off = flag.eq(False)

  # rows where the flag is True are valid only when the amount is exactly 0
  flag_on_and_zero = flag.eq(True) & amount.eq(0.0)

  return flag_on_and_zero | flag_off

def transform(column_names, data):
  """Replace the forced 0 values of flagged rows with a typical value.

  Args:
    column_names: two column names; the first is taken to be the boolean
      column (has_rewards) and the second the numerical column
      (amenities_fee).
    data: pandas DataFrame holding both columns.

  Returns:
    A copy of ``data`` in which the numerical column is overwritten with a
    typical value (median) on every row whose boolean column is True. The
    DataFrame passed in is left unmodified.
  """
  boolean_column = column_names[0]
  numerical_column = column_names[1]

  # work on a copy so the caller's DataFrame is not changed as a side effect
  data = data.copy()
  is_flagged = data[boolean_column] == True
  amounts = data[numerical_column]

  # The flagged rows all hold the forced 0, so including them would drag the
  # median toward 0 (and make it exactly 0 once they are the majority).
  # Take the median of the unflagged rows instead, and only fall back to the
  # overall median when no unflagged row has a value.
  unflagged_amounts = amounts[~is_flagged].dropna()
  if unflagged_amounts.empty:
    typical_value = amounts.median()
  else:
    typical_value = unflagged_amounts.median()

  data[numerical_column] = amounts.mask(is_flagged, typical_value)

  return data

def reverse_transform(column_names, data):
  """Restore the rule: set the numerical column to 0 where the flag is True.

  Args:
    column_names: two column names; the first is taken to be the boolean
      column (has_rewards) and the second the numerical column
      (amenities_fee).
    data: pandas DataFrame holding both columns.

  Returns:
    A copy of ``data`` whose numerical column is 0.0 on every row where the
    boolean column is True. The DataFrame passed in is left unmodified.
  """
  boolean_column = column_names[0]
  numerical_column = column_names[1]

  # work on a copy so the caller's DataFrame is not changed as a side effect
  data = data.copy()

  # set the numerical column to 0 if the boolean is True
  is_flagged = data[boolean_column] == True
  data[numerical_column] = data[numerical_column].mask(is_flagged, 0.0)

  return data

# build the constraint class from the three functions defined in this file
IfTrueThenZero = create_custom_constraint_class(
    transform_fn=transform,
    reverse_transform_fn=reverse_transform,
    is_valid_fn=is_valid,
)

Overwriting ./custom_constraint_example.py


In [24]:
from sdv.single_table import GaussianCopulaSynthesizer

# create the synthesizer from the demo dataset's metadata
synthesizer = GaussianCopulaSynthesizer(metadata)

In [28]:
# load the constraint from the file
# register the custom constraint class stored in the helper file
synthesizer.load_custom_constraint_classes(
    class_names=['IfTrueThenZero'],
    filepath='custom_constraint_example.py',
)

# describe one constraint built from that class:
# whenever has_rewards is True, amenities_fee must be 0
rewards_member_no_fee = {
    'constraint_class': 'IfTrueThenZero',
    'constraint_parameters': {'column_names': ['has_rewards', 'amenities_fee']},
}

# attach the constraint to the synthesizer
synthesizer.add_constraints([rewards_member_no_fee])

# fit on the real data, then draw 100 synthetic rows
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 1264.88it/s]


In [33]:
# show only the synthetic rows whose has_rewards flag is True
synthetic_data.loc[synthetic_data["has_rewards"].eq(True)]

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
10,maykimberly@example.com,True,BASIC,0.0,04 Sep 2020,07 Aug 2020,289.14,"51118 Michael Spur Apt. 742\nTamarashire, DE 9...",3592111594414982
23,wendyjohnson@example.org,True,BASIC,0.0,29 Mar 2020,05 Mar 2020,104.52,"422 Carrie Skyway\nSouth Amanda, WI 59540",3569823991576069
28,annesherman@example.org,True,BASIC,0.0,22 Jun 2020,20 Jun 2020,120.61,"256 Melissa Mount Suite 854\nBarnesborough, TN...",345473988199462
33,ariana98@example.org,True,BASIC,0.0,01 Jan 2021,30 Dec 2020,110.0,"2166 Whitney Knoll Apt. 395\nEast Brian, ME 77217",3506550214739424
34,hillcourtney@example.com,True,BASIC,0.0,16 Feb 2020,05 Feb 2020,168.44,"79675 Bridges Cove Apt. 006\nJoneschester, AK ...",3584123918086024
43,latoya51@example.org,True,DELUXE,0.0,24 Apr 2020,19 May 2020,198.75,"722 Kristine Dam Apt. 241\nEast Josephport, IN...",340691490658064
54,vanessaacevedo@example.org,True,BASIC,0.0,21 Dec 2020,26 Dec 2020,122.81,"0890 Morrow Ford\nEast Michele, VT 85148",345637248311683
58,hayesjeffery@example.net,True,BASIC,0.0,09 Nov 2020,10 Nov 2020,203.39,"81818 Smith Crest\nEast Aaron, IL 69477",3524466883698569
71,lindsey88@example.com,True,BASIC,0.0,11 Feb 2020,28 Jan 2020,132.48,"27639 Steven Greens\nHeatherfort, HI 72118",3566515327829948
72,jmcconnell@example.com,True,DELUXE,0.0,06 Jan 2020,08 Jan 2020,85.29,"716 Jodi Heights\nWilliamshaven, SC 51945",4227426144617853509
