In [1]:
import pandas as pd
import numpy as np

# Toy table with one column per kind of value (integer-like, float, categorical,
# boolean, mostly-missing numeric, date string); every column has at least one
# missing entry written as None.
data = pd.DataFrame(dict(
    integer=[1, None, 1, 2, 1, 2, 3, 2],
    float=[0.1, None, 0.1, 0.2, 0.1, 0.2, 0.3, 0.1],
    categorical=['a', 'b', 'a', 'b', 'a', None, 'c', None],
    bool=[False, True, False, True, False, False, False, None],
    nullable=[1, None, 3, None, 5, None, 7, None],
    datetime=[
        '2010-01-01', '2010-02-01', '2010-01-01', '2010-02-01',
        '2010-01-01', '2010-02-01', '2010-03-01', None,
    ],
))

In [2]:
# Render the toy DataFrame to inspect how the None entries were stored.
data

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,1.0,0.1,a,False,1.0,2010-01-01
1,,,b,True,,2010-02-01
2,1.0,0.1,a,False,3.0,2010-01-01
3,2.0,0.2,b,True,,2010-02-01
4,1.0,0.1,a,False,5.0,2010-01-01
5,2.0,0.2,,False,,2010-02-01
6,3.0,0.3,c,False,7.0,2010-03-01
7,2.0,0.1,,,,


In [3]:
# Write the toy table to 'data.csv' without the row index column.
data.to_csv(path_or_buf='data.csv', index=False)

In [4]:
import json

# Metadata describing a single table named "data" stored in data.csv:
# one field entry per CSV column, listed in column order.
# NOTE(review): the "categorical" field sets pii=False yet still carries a
# pii_category -- kept as in the original; confirm whether that is intended.
metadata = dict(
    path="",
    tables=[
        dict(
            fields=[
                dict(name="integer", type="number", subtype="integer"),
                dict(name="float", type="number", subtype="float"),
                dict(
                    name="categorical",
                    type="categorical",
                    subtype="categorical",
                    pii=False,
                    pii_category="email",
                ),
                dict(name="bool", type="categorical", subtype="bool"),
                dict(name="nullable", type="number", subtype="float"),
                dict(name="datetime", type="datetime", format="%Y-%m-%d"),
            ],
            headers=True,
            name="data",
            path="data.csv",
            use=True,
        ),
    ],
)

# Also write the metadata to disk as JSON with 4-space indentation.
with open('data.meta.json', 'w') as meta_file:
    json.dump(metadata, meta_file, indent=4)

In [5]:
from sdv import SDV

# Create an SDV instance and fit it, passing the in-memory metadata dict
# directly (the data.meta.json file written earlier is not read here).
sdv = SDV()
sdv.fit(metadata)

2019-10-08 15:04:18,669 - INFO - modeler - Modeling data
2019-10-08 15:04:18,670 - INFO - metadata - Loading table data
2019-10-08 15:04:18,715 - INFO - modeler - Modeling Complete


In [6]:
# Sample synthetic rows for every table using sample_all's default arguments,
# then display the synthetic version of the 'data' table.
# NOTE(review): no random seed is set anywhere in this notebook, so the rows
# presumably change from run to run -- confirm if reproducibility matters.
samples = sdv.sample_all()
samples['data']

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,2.0,0.148526,,,,NaT
1,3.0,0.169611,c,,,NaT
2,2.0,0.183588,b,True,,2010-01-24 18:22:35.496082176
3,,,a,,,NaT
4,2.0,0.211784,b,True,6.055091,2010-02-03 07:10:02.057572352


In [7]:
from sdv.evaluation import evaluate

# Real tables, loaded through the fitted instance's metadata.
real = sdv.metadata.get_tables()

# Request as many synthetic rows as the real 'data' table contains.
num_rows = len(real['data'])
samples = sdv.sample_all(num_rows)

# Score synthetic against real; cast to str so the values print in full.
scores = evaluate(real, samples)
scores.astype(str)

2019-10-08 15:04:24,046 - INFO - metadata - Loading table data


mse         0.06147794893540761
rmse        0.24794747212949683
r2_score     0.8621624152418914
dtype: object

In [8]:
from sdv import SDV

# Rebind `sdv` to a fresh SDV instance (replacing the one fitted on the toy
# table) and fit it from the metadata file at '../tests/data/meta.json'.
sdv = SDV()
sdv.fit('../tests/data/meta.json')

2019-10-08 15:04:27,649 - INFO - modeler - Modeling DEMO_ORDER_ITEMS
2019-10-08 15:04:27,650 - INFO - metadata - Loading table DEMO_ORDER_ITEMS
2019-10-08 15:04:27,659 - INFO - modeler - Modeling DEMO_ORDERS
2019-10-08 15:04:27,660 - INFO - metadata - Loading table DEMO_ORDERS
2019-10-08 15:04:27,667 - INFO - metadata - Loading table DEMO_ORDER_ITEMS
2019-10-08 15:04:27,670 - INFO - metadata - Loading table DEMO_ORDER_ITEMS
2019-10-08 15:04:27,756 - INFO - modeler - Modeling DEMO_CUSTOMERS
2019-10-08 15:04:27,756 - INFO - metadata - Loading table DEMO_CUSTOMERS
2019-10-08 15:04:27,766 - INFO - metadata - Loading table DEMO_ORDERS
2019-10-08 15:04:28,335 - INFO - modeler - Modeling Complete


In [16]:
from sdv.evaluation import evaluate

# Real demo tables, loaded through the fitted instance's metadata.
real = sdv.metadata.get_tables()

# Request as many synthetic rows as the real DEMO_CUSTOMERS table contains,
# asking the sampler to reset primary keys.
num_customers = len(real['DEMO_CUSTOMERS'])
samples = sdv.sample_all(num_customers, reset_primary_keys=True)

# Score synthetic against real; cast to str so the values print in full.
scores = evaluate(real, samples)
scores.astype(str)

2019-10-08 15:04:38,128 - INFO - metadata - Loading table DEMO_CUSTOMERS
2019-10-08 15:04:38,132 - INFO - metadata - Loading table DEMO_ORDERS
2019-10-08 15:04:38,136 - INFO - metadata - Loading table DEMO_ORDER_ITEMS


mse         1.0523891803778435e+50
rmse        1.0258602148333094e+25
r2_score        0.9999999999963233
dtype: object

In [17]:
# Summary statistics of the synthetic DEMO_CUSTOMERS table.
samples['DEMO_CUSTOMERS'].describe()

Unnamed: 0,CUSTOMER_ID,PHONE_NUMBER1,CREDIT_LIMIT
count,7.0,7.0,7.0
mean,3.0,7356161000.0,1110.0
std,2.160247,1267037000.0,331.907618
min,0.0,5206541000.0,511.0
25%,1.5,6559874000.0,1013.5
50%,3.0,7892013000.0,1113.0
75%,4.5,8356601000.0,1265.5
max,6.0,8561623000.0,1588.0


In [18]:
# Summary statistics of the real DEMO_CUSTOMERS table, for comparison with the synthetic one.
real['DEMO_CUSTOMERS'].describe()

Unnamed: 0,CUSTOMER_ID,PHONE_NUMBER1,CREDIT_LIMIT
count,7.0,7.0,7.0
mean,22198560.0,6464124000.0,1071.428571
std,39077740.0,1374656000.0,449.867705
min,4.0,4045553000.0,500.0
25%,299081.0,6175553000.0,1000.0
50%,630407.0,6175553000.0,1000.0
75%,28411250.0,7035552000.0,1000.0
max,97338810.0,8605552000.0,2000.0


In [19]:
# Summary statistics of the synthetic DEMO_ORDERS table.
samples['DEMO_ORDERS'].describe()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,ORDER_TOTAL
count,19.0,19.0,19.0
mean,9.0,2.947368,1774.894737
std,5.627314,1.899523,1530.278584
min,0.0,0.0,41.0
25%,4.5,1.5,984.5
50%,9.0,3.0,1285.0
75%,13.5,4.5,2146.5
max,18.0,6.0,6539.0


In [20]:
# Summary statistics of the real DEMO_ORDERS table, for comparison with the synthetic one.
real['DEMO_ORDERS'].describe()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,ORDER_TOTAL
count,10.0,10.0,10.0
mean,5.5,40400890.0,1397.5
std,3.02765,45182780.0,722.322835
min,1.0,4.0,730.0
25%,3.25,50.0,733.25
50%,5.5,27998100.0,1223.0
75%,7.75,87003140.0,2125.0
max,10.0,97338810.0,2380.0


In [21]:
# Display the synthetic DEMO_ORDER_ITEMS rows.
samples['DEMO_ORDER_ITEMS']

Unnamed: 0,ORDER_ITEM_ID,ORDER_ID,PRODUCT_ID,UNIT_PRICE,QUANTITY
0,0,0.0,5.0,54.0,3.0
1,1,0.0,3.0,66.0,5.0
2,2,0.0,4.0,58.0,4.0
3,3,0.0,5.0,54.0,1.0
4,4,0.0,3.0,73.0,5.0
5,5,1.0,7.0,105.0,5.0
6,6,1.0,10.0,39.0,2.0
7,7,1.0,16.0,28.0,0.0
8,8,1.0,2.0,119.0,4.0
9,9,1.0,9.0,61.0,1.0


In [22]:
# Display the real DEMO_ORDER_ITEMS rows, for comparison with the synthetic ones.
real['DEMO_ORDER_ITEMS']

Unnamed: 0,ORDER_ITEM_ID,ORDER_ID,PRODUCT_ID,UNIT_PRICE,QUANTITY
0,100,10,7,52,8
1,101,8,6,125,4
2,102,1,6,125,4
3,103,4,9,125,4
4,104,1,9,113,4
5,105,9,10,87,2
6,106,10,6,39,4
7,107,1,6,50,4
8,108,2,3,31,2
9,109,4,6,37,3
