In [1]:
import pandas as pd
import numpy as np

# Small 8-row example frame: six columns, each containing at least one
# missing value (None).
data = pd.DataFrame(
    data={
        'integer': [1, None, 1, 2, 1, 2, 3, 2],
        'float': [0.1, None, 0.1, 0.2, 0.1, 0.2, 0.3, 0.1],
        'categorical': ['a', 'b', 'a', 'b', 'a', None, 'c', None],
        'bool': [False, True, False, True, False, False, False, None],
        'nullable': [1, None, 3, None, 5, None, 7, None],
        # Dates stay plain strings: January/February alternate three times,
        # followed by a March date and a missing entry.
        'datetime': ['2010-01-01', '2010-02-01'] * 3 + ['2010-03-01', None],
    }
)

In [2]:
# Bare expression: render the whole frame as this cell's output.
data

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,1.0,0.1,a,False,1.0,2010-01-01
1,,,b,True,,2010-02-01
2,1.0,0.1,a,False,3.0,2010-01-01
3,2.0,0.2,b,True,,2010-02-01
4,1.0,0.1,a,False,5.0,2010-01-01
5,2.0,0.2,,False,,2010-02-01
6,3.0,0.3,c,False,7.0,2010-03-01
7,2.0,0.1,,,,


In [3]:
# Write the frame to 'data.csv' (relative to the working directory),
# leaving out the row index.
data.to_csv('data.csv', index=False)

In [3]:
import json

# Metadata describing a single table stored in data.csv, with one field
# entry per column of that file.
metadata = {
    "path": "",
    "tables": [{
        "fields": [
            {"name": "integer", "type": "number", "subtype": "integer"},
            {"name": "float", "type": "number", "subtype": "float"},
            # NOTE(review): "pii" is False while a "pii_category" is still
            # given — confirm that this combination is intentional.
            {
                "name": "categorical",
                "type": "categorical",
                "subtype": "categorical",
                "pii": False,
                "pii_category": "email",
            },
            {"name": "bool", "type": "categorical", "subtype": "bool"},
            {"name": "nullable", "type": "number", "subtype": "float"},
            {"name": "datetime", "type": "datetime", "format": "%Y-%m-%d"},
        ],
        "headers": True,
        "name": "data",
        "path": "data.csv",
        "use": True,
    }],
}

# Save the same structure as 4-space-indented JSON in 'data.meta.json'.
with open('data.meta.json', 'w') as f:
    f.write(json.dumps(metadata, indent=4))

In [4]:
from sdv import SDV

# Seed NumPy's global RNG ahead of the first fit/sample step so that a
# Restart & Run All produces comparable synthetic data between runs.
# NOTE(review): assumes SDV draws its randomness from np.random — confirm.
np.random.seed(0)

# Create an SDV instance and fit it on the in-memory `metadata` dict
# (the same structure that was also written to 'data.meta.json').
sdv = SDV()
sdv.fit(metadata)

2019-10-08 15:24:15,275 - INFO - modeler - Modeling data
2019-10-08 15:24:15,276 - INFO - metadata - Loading table data
2019-10-08 15:24:15,321 - INFO - modeler - Modeling Complete


In [5]:
# Call sample_all() with its default arguments; the result is indexed by
# table name below.
samples = sdv.sample_all()
# Show the rows generated for the 'data' table as the cell output.
samples['data']

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,2.0,0.185731,b,False,2.753619,2010-01-16 01:08:55.990858496
1,2.0,0.11346,b,False,,2010-01-12 11:28:24.330070272
2,1.0,0.094154,,False,,2010-01-07 02:23:58.552935680
3,1.0,0.073905,a,False,,2010-01-10 07:28:18.259157504
4,2.0,0.224952,b,True,5.190492,2010-01-23 23:52:17.919372800


In [6]:
from sdv.evaluation import evaluate

# Tables obtained through the fitted SDV's metadata object, indexed by
# table name.
real = sdv.metadata.get_tables()

# Sample again, this time passing the row count of the real 'data' table.
# This rebinds `samples` from the previous sampling cell.
samples = sdv.sample_all(len(real['data']))

# Compare the real and sampled tables; the result is cast to str before it
# is displayed (presumably to avoid rounded float display — TODO confirm).
evaluate(real, samples).astype(str)

2019-10-08 15:24:17,617 - INFO - metadata - Loading table data


mse          0.1654370411227441
rmse        0.40673952490844073
r2_score         0.611632793267
dtype: object

In [7]:
from sdv import SDV

# Rebind `sdv` to a fresh instance and fit it from a metadata file path
# (the earlier fit was given an in-memory dict instead).
sdv = SDV()
sdv.fit('../tests/data/meta.json')

2019-10-08 15:24:18,521 - INFO - modeler - Modeling DEMO_ORDER_ITEMS
2019-10-08 15:24:18,522 - INFO - metadata - Loading table DEMO_ORDER_ITEMS
2019-10-08 15:24:18,529 - INFO - modeler - Modeling DEMO_ORDERS
2019-10-08 15:24:18,530 - INFO - metadata - Loading table DEMO_ORDERS
2019-10-08 15:24:18,616 - INFO - modeler - Modeling DEMO_CUSTOMERS
2019-10-08 15:24:18,616 - INFO - metadata - Loading table DEMO_CUSTOMERS
2019-10-08 15:24:19,169 - INFO - modeler - Modeling Complete


In [17]:
from sdv.evaluation import evaluate

# Tables for the currently fitted `sdv`, indexed by table name.
real = sdv.metadata.get_tables()

# Use the real DEMO_CUSTOMERS row count as the sample size.
# NOTE(review): reset_primary_keys=True presumably restarts primary-key
# generation for this call — confirm against the SDV documentation.
samples = sdv.sample_all(len(real['DEMO_CUSTOMERS']), reset_primary_keys=True)

# Compare the real and sampled tables; cast the result to str for display.
evaluate(real, samples).astype(str)

2019-10-08 15:25:10,655 - INFO - metadata - Loading table DEMO_CUSTOMERS
2019-10-08 15:25:10,660 - INFO - metadata - Loading table DEMO_ORDERS
2019-10-08 15:25:10,663 - INFO - metadata - Loading table DEMO_ORDER_ITEMS


mse         2.2389034056938003e+61
rmse        4.7317051954805894e+30
r2_score        0.4010487722332118
dtype: object

In [9]:
# describe() summary of the sampled DEMO_CUSTOMERS table.
samples['DEMO_CUSTOMERS'].describe()

Unnamed: 0,CUSTOMER_ID,PHONE_NUMBER1,CREDIT_LIMIT
count,7.0,7.0,7.0
mean,3.0,6585222000.0,668.571429
std,2.160247,609865200.0,344.87817
min,0.0,5686286000.0,325.0
25%,1.5,6162029000.0,459.5
50%,3.0,6559897000.0,539.0
75%,4.5,7143657000.0,784.0
max,6.0,7239000000.0,1329.0


In [10]:
# describe() summary of the real DEMO_CUSTOMERS table, for comparison with
# the sampled one.
real['DEMO_CUSTOMERS'].describe()

Unnamed: 0,CUSTOMER_ID,PHONE_NUMBER1,CREDIT_LIMIT
count,7.0,7.0,7.0
mean,22198560.0,6464124000.0,1071.428571
std,39077740.0,1374656000.0,449.867705
min,4.0,4045553000.0,500.0
25%,299081.0,6175553000.0,1000.0
50%,630407.0,6175553000.0,1000.0
75%,28411250.0,7035552000.0,1000.0
max,97338810.0,8605552000.0,2000.0


In [11]:
# describe() summary of the sampled DEMO_ORDERS table.
samples['DEMO_ORDERS'].describe()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,ORDER_TOTAL
count,17.0,17.0,17.0
mean,8.0,2.941176,1372.705882
std,5.049752,2.164214,597.027508
min,0.0,0.0,271.0
25%,4.0,1.0,1001.0
50%,8.0,3.0,1375.0
75%,12.0,5.0,1780.0
max,16.0,6.0,2542.0


In [12]:
# describe() summary of the real DEMO_ORDERS table, for comparison with the
# sampled one.
real['DEMO_ORDERS'].describe()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,ORDER_TOTAL
count,10.0,10.0,10.0
mean,5.5,40400890.0,1397.5
std,3.02765,45182780.0,722.322835
min,1.0,4.0,730.0
25%,3.25,50.0,733.25
50%,5.5,27998100.0,1223.0
75%,7.75,87003140.0,2125.0
max,10.0,97338810.0,2380.0


In [13]:
# Display the sampled DEMO_ORDER_ITEMS table as the cell output.
samples['DEMO_ORDER_ITEMS']

Unnamed: 0,ORDER_ITEM_ID,ORDER_ID,PRODUCT_ID,UNIT_PRICE,QUANTITY
0,0,0.0,8.0,143.0,2.0
1,1,0.0,10.0,105.0,3.0
2,2,0.0,12.0,44.0,0.0
3,3,0.0,8.0,90.0,4.0
4,4,0.0,6.0,184.0,5.0
5,5,0.0,11.0,109.0,3.0
6,6,0.0,6.0,103.0,4.0
7,7,0.0,10.0,68.0,6.0
8,8,0.0,6.0,84.0,6.0
9,9,1.0,9.0,99.0,4.0


In [14]:
# Display the real DEMO_ORDER_ITEMS table for comparison with the sampled one.
real['DEMO_ORDER_ITEMS']

Unnamed: 0,ORDER_ITEM_ID,ORDER_ID,PRODUCT_ID,UNIT_PRICE,QUANTITY
0,100,10,7,52,8
1,101,8,6,125,4
2,102,1,6,125,4
3,103,4,9,125,4
4,104,1,9,113,4
5,105,9,10,87,2
6,106,10,6,39,4
7,107,1,6,50,4
8,108,2,3,31,2
9,109,4,6,37,3
