In [1]:
import warnings
# Install a blanket 'ignore' filter so warnings raised from here on are not
# printed into the cell outputs.
warnings.simplefilter('ignore')

In [2]:
from sdv.demo import load_tabular_demo

# Load the tabular demo dataset provided by sdv.demo; it is kept under the
# name `employees` and used as the training table further down.
employees = load_tabular_demo()

In [3]:
# Bare expression: shows the loaded demo table as this cell's output.
employees

Unnamed: 0,company,department,name,address,age,age_when_joined,years_in_the_company
0,Pear,Sales,Andrew Gonzalez,Unit 6052 Box 2838\nDPO AA 79358,48,47,1
1,Pear,Design,David Riley DDS,"039 Bender Fall\nNorth Stacytown, IN 71970",35,33,2
2,Glasses,AI,Christina Holmes,"PSC 4263, Box 8772\nAPO AE 06596",31,23,8
3,Glasses,Search Engine,Jeffrey Ferrell MD,"3294 Barbara Corner\nChambersside, FL 89465",47,43,4
4,Cheerper,BigData,James Jones,"PSC 4326, Box 9319\nAPO AE 53951",30,22,8
5,Cheerper,Support,Brian Lee,"671 David Wells Suite 988\nPort Eugene, NV 62074",40,38,2
6,Pear,Sales,Vernon Hall,"90057 Miller Ferry Suite 697\nWest Shelly, NC ...",44,37,7
7,Pear,Design,Dr. Victoria Martinez,"333 Jimenez Port\nSouth Matthew, AK 87359",36,28,8
8,Glasses,AI,Sarah Wright,"71684 Travis Glens Apt. 342\nBryantchester, SC...",45,39,6
9,Glasses,Search Engine,Sandra Henry,"103 Mcbride Crescent Suite 753\nPort Brianna, ...",45,39,6


In [4]:
def years_in_the_company(data):
    """Derive tenure as current age minus the age at joining.

    Args:
        data: Object indexable by the keys ``'age'`` and
            ``'age_when_joined'`` (for example a table of employees).

    Returns:
        The result of ``data['age'] - data['age_when_joined']``.
    """
    current_age = data['age']
    joining_age = data['age_when_joined']
    return current_age - joining_age

In [5]:
def age_above_30(data):
    """Tell whether the ``'age'`` entry is strictly greater than 30.

    Args:
        data: Object indexable by the key ``'age'``.

    Returns:
        The result of ``data['age'] > 30``; an age of exactly 30 does not
        satisfy the check.
    """
    ages = data['age']
    return ages > 30

In [6]:
from sdv.constraints import tabular
from sdv.tabular import GaussianCopula

# Rules handed to the model; one entry per constraint.
constraints = [
    # Constrains the (company, department) pair as a combination and handles
    # violations by reject sampling.
    # NOTE(review): exact semantics are defined by SDV -- confirm in its docs.
    tabular.UniqueCombinations(
        columns=['company', 'department'],
        handling_strategy='reject_sampling',
    ),
    # Relates the two age columns: 'age_when_joined' is the low side,
    # 'age' the high side.
    tabular.GreaterThan(
        low='age_when_joined',
        high='age',
    ),
    # 'years_in_the_company' is tied to the formula function defined earlier.
    tabular.ColumnFormula(
        'years_in_the_company',
        years_in_the_company,
    ),
    # Row validity is decided by the user-defined age_above_30 check.
    tabular.CustomConstraint(
        is_valid=age_above_30,
    ),
]

# 'name' and 'address' are registered as fields to anonymize, each mapped to
# a category of the same name.
gc = GaussianCopula(
    constraints=constraints,
    anonymize_fields={
        'name': 'name',
        'address': 'address',
    },
)
gc.fit(employees)

2020-07-23 22:12:06,019 - INFO - table - Loading transformer OneHotEncodingTransformer for field company
2020-07-23 22:12:06,020 - INFO - table - Loading transformer OneHotEncodingTransformer for field department
2020-07-23 22:12:06,020 - INFO - table - Loading transformer OneHotEncodingTransformer for field name
2020-07-23 22:12:06,021 - INFO - table - Loading transformer OneHotEncodingTransformer for field address
2020-07-23 22:12:06,021 - INFO - table - Loading transformer NumericalTransformer for field age_when_joined
2020-07-23 22:12:06,022 - INFO - table - Loading transformer NumericalTransformer for field age
2020-07-23 22:12:06,074 - INFO - gaussian - Fitting GaussianMultivariate()


In [7]:
# Request 100 rows from the fitted model. The log lines emitted by this call
# report invalid rows being found and resampled.
constrained = gc.sample(100)

2020-07-23 22:12:16,043 - INFO - base - 29 invalid rows found. Resampling 40 rows
2020-07-23 22:12:17,933 - INFO - base - 3 invalid rows found. Resampling 6 rows


In [8]:
# Preview the first rows of the sampled table.
# NOTE(review): the displayed row labels are not contiguous (0, 1, 2, 4, 5);
# presumably rejected rows leave gaps in the index -- confirm.
constrained.head()

Unnamed: 0,company,department,name,address,age,age_when_joined,years_in_the_company
0,Cheerper,BigData,Crystal Thompson,"570 Jacob Cliff Apt. 634\nRomeroborough, NC 48783",37,33,4
1,Glasses,AI,Jennifer Garcia,"9334 Erin Stravenue\nThomasville, TX 78772",46,41,5
2,Pear,Design,Lori Henry,"468 Darren Extension\nMeganmouth, DC 47073",34,31,3
4,Pear,Sales,David Sawyer,"12538 Becker Plaza Apt. 842\nGatesborough, NE ...",47,46,1
5,Glasses,AI,Lori Henry,"468 Darren Extension\nMeganmouth, DC 47073",35,27,8


In [9]:
# Retrieve the metadata object held by the fitted model so it can be
# inspected below.
metadata = gc.get_metadata()

In [10]:
import json

# Convert the metadata to a plain dict and pretty-print it as JSON with a
# 4-space indent; the output lists the fields and the constraints.
print(json.dumps(metadata.to_dict(), indent=4))

{
    "fields": {
        "company": {
            "type": "categorical"
        },
        "department": {
            "type": "categorical"
        },
        "name": {
            "type": "categorical",
            "pii": true,
            "pii_category": "name"
        },
        "address": {
            "type": "categorical",
            "pii": true,
            "pii_category": "address"
        },
        "age": {
            "type": "numerical",
            "subtype": "integer"
        },
        "age_when_joined": {
            "type": "numerical",
            "subtype": "integer"
        },
        "years_in_the_company": {
            "type": "numerical",
            "subtype": "integer"
        }
    },
    "constraints": [
        {
            "constraint": "sdv.constraints.tabular.UniqueCombinations",
            "columns": [
                "company",
                "department"
            ],
            "handling_strategy": "reject_sampling"
        },
        {
     