In [1]:
import warnings
# Ignore every warning raised from here on (presumably to keep the recorded
# cell outputs uncluttered).
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
from faker import Faker

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same demo
# table. A dedicated generator is used so that NumPy's global random state is
# left untouched for the rest of the notebook.
SEED = 0
rng = np.random.RandomState(SEED)

# Ages are drawn from [30, 50) (upper bound exclusive); the tenure drawn from
# [0, 10) is subtracted so that age_when_joined <= age holds by construction.
age = rng.randint(30, 50, 12)
age_when_joined = age - rng.randint(0, 10, 12)

# 12 rows: the six (company, department) pairs below, each repeated twice.
employees = pd.DataFrame({
    'company': ['A', 'A', 'B', 'B', 'C', 'C'] * 2,
    'department': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
    'age': age,
    'age_when_joined': age_when_joined,
    # Derived column: always equals age - age_when_joined in the real data.
    'years_in_the_company': age - age_when_joined
})

In [3]:
# Display the generated employees table.
employees

Unnamed: 0,company,department,age,age_when_joined,years_in_the_company
0,A,a,41,37,4
1,A,b,41,34,7
2,B,c,45,38,7
3,B,d,49,42,7
4,C,e,48,43,5
5,C,f,49,46,3
6,A,a,48,43,5
7,A,b,40,36,4
8,B,c,37,32,5
9,B,d,49,41,8


In [4]:
# Distinct (company, department) pairs present in the real data, as a list of
# [company, department] lists.
company_dept = (
    employees[['company', 'department']]
    .drop_duplicates()
    .values
    .tolist()
)

In [5]:
# Rebind company_dept from a list of [company, department] lists to a Series of
# their string representations; validate() compares against it with .isin().
company_dept = pd.Series(company_dept).astype(str)

In [6]:
# Sanity check: every (company, department) pair of the real data, stringified
# the same way as company_dept, must be found in company_dept.
(
    pd.Series(employees[['company', 'department']].drop_duplicates().values.tolist())
    .astype(str)
    .isin(company_dept)
    .all()
)

True

In [7]:
def validate(data, known_combinations=None):
    """Print whether ``data`` satisfies the three rules checked in this notebook.

    One line is printed per rule, each ending in ``True`` or ``False``:

    * ``Unique company+dept`` - every distinct (company, department) pair of
      ``data`` appears in ``known_combinations``.
    * ``age >= age_when_joined`` - holds for every row.
    * ``age - age_when_joined == years`` - the difference matches the
      ``years_in_the_company`` column (compared with ``Series.equals``, which
      is stricter than an element-wise ``==``).

    Args:
        data: DataFrame with the columns ``company``, ``department``, ``age``,
            ``age_when_joined`` and ``years_in_the_company``.
        known_combinations: Optional collection of stringified
            ``[company, department]`` lists to check against. When omitted,
            the notebook-level ``company_dept`` is used, as before.

    Returns:
        None. The results are only printed.
    """
    if known_combinations is None:
        # Fall back to the reference built from the real data earlier in the
        # notebook; looked up lazily so an explicit argument never needs it.
        known_combinations = company_dept

    # Stringify the pairs the same way the reference was built so that
    # .isin() compares like with like.
    combinations = pd.Series(
        data[['company', 'department']].drop_duplicates().values.tolist()
    ).astype(str)
    print('Unique company+dept', combinations.isin(known_combinations).all())

    print('age >= age_when_joined', (data['age'] >= data['age_when_joined']).all())

    tenure = data['age'] - data['age_when_joined']
    print(
        'age - age_when_joined == years',
        tenure.equals(data['years_in_the_company'])
    )

In [8]:
# Baseline: run the three checks on the real data itself.
validate(employees)

Unique company+dept True
age >= age_when_joined True
age - age_when_joined == years True


In [9]:
from sdv.tabular import GaussianCopula

# Fit a GaussianCopula model with no constraints on the real table; its
# samples are validated below as the unconstrained baseline.
gc = GaussianCopula()
gc.fit(employees)

2020-07-22 22:29:23,498 - INFO - table - Loading transformer OneHotEncodingTransformer for field company
2020-07-22 22:29:23,499 - INFO - table - Loading transformer OneHotEncodingTransformer for field department
2020-07-22 22:29:23,499 - INFO - table - Loading transformer NumericalTransformer for field age
2020-07-22 22:29:23,500 - INFO - table - Loading transformer NumericalTransformer for field age_when_joined
2020-07-22 22:29:23,500 - INFO - table - Loading transformer NumericalTransformer for field years_in_the_company
2020-07-22 22:29:23,514 - INFO - gaussian - Fitting GaussianMultivariate()


In [10]:
# Draw 100 synthetic rows from the unconstrained model.
sampled = gc.sample(100)

In [11]:
# Run the same checks on the unconstrained samples (all three report False in
# the recorded run).
validate(sampled)

Unique company+dept False
age >= age_when_joined False
age - age_when_joined == years False


In [12]:
def years_in_the_company(data):
    """Return ``data['age']`` minus ``data['age_when_joined']``.

    Passed to ``ColumnFormulaConstraint`` later in the notebook as the formula
    for the ``years_in_the_company`` column.
    """
    current_age = data['age']
    joining_age = data['age_when_joined']
    return current_age - joining_age

In [18]:
def age_above_30(data):
    """Return only the rows of ``data`` whose ``age`` is strictly greater than 30.

    Rows with ``age == 30`` are excluded by the strict comparison. The original
    index labels of the kept rows are preserved.
    """
    older_than_30 = data['age'] > 30
    return data[older_than_30]

In [19]:
from sdv.constraints import transformation, validation

# Rules the synthetic data should respect, handed to the model at construction.
# NOTE(review): the per-class semantics described below are inferred from the
# class names and the recorded logs - confirm against the SDV version in use.
constraints = [
    # company and department are handled as one combined field (the fit log
    # reports a single `company#department` field).
    transformation.UniqueCombinationsConstraint(columns=['company', 'department']),
    # presumably enforces age_when_joined (low) <= age (high)
    transformation.GreaterThanConstraint(low='age_when_joined', high='age'),
    # years_in_the_company is tied to the years_in_the_company() function
    # (age - age_when_joined); the fit log loads no transformer for this column.
    transformation.ColumnFormulaConstraint('years_in_the_company', years_in_the_company),
    # age_above_30() keeps only rows with age > 30; presumably the rows it drops
    # count as invalid and are resampled (see the "invalid rows" sampling log).
    validation.ValidationConstraint(age_above_30)
]
# Rebinds `gc` to a new, constrained model and fits it on the same real table.
gc = GaussianCopula(constraints=constraints)
gc.fit(employees)

2020-07-22 22:30:00,106 - INFO - table - Loading transformer NumericalTransformer for field age_when_joined
2020-07-22 22:30:00,106 - INFO - table - Loading transformer OneHotEncodingTransformer for field company#department
2020-07-22 22:30:00,107 - INFO - table - Loading transformer NumericalTransformer for field age
2020-07-22 22:30:00,123 - INFO - gaussian - Fitting GaussianMultivariate()


In [20]:
# Draw 1000 synthetic rows from the constrained model.
constrained = gc.sample(1000)

2020-07-22 22:30:10,459 - INFO - base - 17 invalid rows found. Resampling 17 rows


In [21]:
# Preview the first rows of the constrained sample.
constrained.head()

Unnamed: 0,company,department,age,age_when_joined,years_in_the_company
0,B,c,41,38,3
1,A,a,33,28,5
2,A,a,51,45,6
3,A,b,48,44,4
4,B,d,51,44,7


In [22]:
# Run the same checks on the constrained samples (all three report True in the
# recorded run).
validate(constrained)

Unique company+dept True
age >= age_when_joined True
age - age_when_joined == years True
