In [112]:
import pandas as pd

# Toy single-table dataset with one column per field kind declared in the
# metadata further down: integer, float, categorical (with missing values),
# boolean, a numeric column with gaps, and date strings in YYYY-MM-DD form.
data = pd.DataFrame(
    dict(
        integer=[1, 2, 1, 2, 1, 2, 3, 1],
        float=[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.3, 0.1],
        # None marks a missing category.
        categorical=['a', 'b', 'a', 'b', 'a', None, 'c', None],
        bool=[False, True, False, True, False, False, False, False],
        # Every second value is missing.
        nullable=[1, None, 3, None, 5, None, 7, None],
        datetime=[
            '2010-01-01',
            '2010-02-01',
            '2010-01-01',
            '2010-02-01',
            '2010-01-01',
            '2010-02-01',
            '2010-03-01',
            '2010-02-01',
        ],
    )
)

In [113]:
# Display the whole 8-row frame to check where the missing values landed
# before it is exported.
data

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,1,0.1,a,False,1.0,2010-01-01
1,2,0.2,b,True,,2010-02-01
2,1,0.1,a,False,3.0,2010-01-01
3,2,0.2,b,True,,2010-02-01
4,1,0.1,a,False,5.0,2010-01-01
5,2,0.2,,False,,2010-02-01
6,3,0.3,c,False,7.0,2010-03-01
7,1,0.1,,False,,2010-02-01


In [114]:
# Write the frame to data.csv without the index column; this is the path the
# table entry in the metadata below points at.
data.to_csv('data.csv', index=False)

In [115]:
import json

# Metadata describing the single table stored in data.csv: one entry per
# column giving its type (and subtype or format), plus the table-level settings.
metadata = {
    "path": "",
    "tables": [
        {
            "fields": [
                {"name": "integer", "type": "number", "subtype": "integer"},
                {"name": "float", "type": "number", "subtype": "float"},
                {
                    "name": "categorical",
                    "type": "categorical",
                    "subtype": "categorical",
                    "pii": False,
                    "pii_category": "email",
                },
                {"name": "bool", "type": "categorical", "subtype": "bool"},
                {"name": "nullable", "type": "number", "subtype": "float"},
                {"name": "datetime", "type": "datetime", "format": "%Y-%m-%d"},
            ],
            "headers": True,
            "name": "data",
            "path": "data.csv",
            "use": True,
        }
    ],
}

# Serialise with 4-space indentation and save it next to the CSV.
with open('data.meta.json', 'w') as meta_file:
    meta_file.write(json.dumps(metadata, indent=4))

In [116]:
from sdv import SDV

# Build an SDV instance from the metadata file written by the previous cell
# and fit it; the INFO log lines below were recorded during this call.
sdv = SDV('data.meta.json')
sdv.fit()

2019-10-06 20:04:23,498 - INFO - modeler - Modeling data
2019-10-06 20:04:23,514 - INFO - modeler - Modeling Complete


In [117]:
# Sample synthetic rows for every table; the result is indexed by table name,
# so 'data' selects the single table declared in the metadata.
# NOTE(review): no random seed is set in any visible cell, so the values in
# the recorded output are presumably not reproducible on re-run — confirm.
samples = sdv.sample_all()
samples['data']

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,0,0.043007,c,False,1.486208,2009-12-18 07:23:53.847842560
1,2,0.194331,,False,,2010-02-04 08:07:30.073904384
2,2,0.205899,,False,5.341036,2010-02-05 08:26:48.177382656
3,1,0.111931,a,True,3.478437,2010-01-02 01:40:28.389134080
4,1,0.098118,a,False,5.343904,2010-01-09 04:32:33.057079296


In [118]:
from sdv import SDV

# Rebind `sdv` to a new instance built from a second metadata file, replacing
# the single-table instance fitted earlier; the recorded log shows three
# DEMO_* tables being modeled.
# NOTE(review): the path is relative, so this assumes the kernel's working
# directory is a sibling of the `tests` directory — confirm.
sdv = SDV('../tests/data/meta.json')
sdv.fit()

2019-10-06 20:04:24,706 - INFO - modeler - Modeling DEMO_ORDER_ITEMS
2019-10-06 20:04:24,707 - INFO - modeler - Modeling DEMO_ORDERS
2019-10-06 20:04:24,791 - INFO - modeler - Modeling DEMO_CUSTOMERS
2019-10-06 20:04:25,359 - INFO - modeler - Modeling Complete


In [122]:
# Rebind `samples` to the output of the multi-table model and show the
# synthetic DEMO_CUSTOMERS table.
# NOTE(review): the recorded output contains a negative CUST_POSTAL_CODE,
# which does not occur in the real DEMO_CUSTOMERS rows printed further down.
samples = sdv.sample_all()
samples['DEMO_CUSTOMERS']

Unnamed: 0,CUSTOMER_ID,CUST_POSTAL_CODE,PHONE_NUMBER1,CREDIT_LIMIT,COUNTRY
0,15,38685,6979853673,28,UK
1,16,50000,5726480170,1072,SPAIN
2,17,49861,7521355972,106,UK
3,18,33329,7882661650,1769,UK
4,19,-19052,2816858140,1276,UK


In [97]:
# Fetch the tables held by the fitted instance (presumably the original input
# data — confirm) so they can be compared against the samples.
# NOTE(review): `dn` looks like an internal attribute of the SDV object;
# check whether a public accessor exists.
# NOTE(review): the recorded execution count of this cell (97) is lower than
# that of the sampling cell above (122); re-run top to bottom on a fresh
# kernel to rule out hidden state.
real = sdv.dn.get_tables()

In [70]:
# Inspect the fetched tables; the recorded output is a dict keyed by table
# name with one DataFrame per table.
real

{'DEMO_CUSTOMERS':    CUSTOMER_ID  CUST_POSTAL_CODE  PHONE_NUMBER1  CREDIT_LIMIT COUNTRY
 0           50             11371     6175553295          1000      UK
 1            4             63145     8605551835           500      US
 2     97338810              6096     7035552143          1000  CANADA
 3       630407             63145     7035552143          2000      US
 4       826362             11371     6175553295          1000      UK
 5     55996144             20166     4045553285          1000  FRANCE
 6       598112             63145     6175553295          1000   SPAIN,
 'DEMO_ORDERS':    ORDER_ID  CUSTOMER_ID  ORDER_TOTAL
 0         1           50         2310
 1         2            4         1507
 2        10     97338810          730
 3         6     55996144          730
 4         3     55996144          939
 5         4           50         2380
 6         5     97338810         1570
 7         7           50          730
 8         8     97338810         2336
 9      

In [74]:
from sdv.evaluation import evaluate

# Score the synthetic tables against the real ones; the recorded output lists
# mse, rmse and r2_score. The result is cast to str before display,
# presumably so the large mse/rmse values print in full — confirm.
evaluate(real, samples).astype(str)

mse          295644032004198.2
rmse        17194302.312225357
r2_score    0.9964691952801145
dtype: object