In [1]:
import warnings
# Install a blanket 'ignore' filter: every warning category is silenced for the
# rest of the kernel session (presumably to keep third-party deprecation noise
# out of the cell outputs). NOTE(review): this also hides warnings that may
# matter, e.g. pandas parsing or dtype warnings.
warnings.filterwarnings('ignore')

import pandas as pd

In [2]:
# Remote location of the raw "adult" data file that the notebook loads.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Column labels, in file order; handed to the CSV reader as `names=`.
columns = [
    # demographics / sampling weight
    'age', 'workclass', 'fnlwgt',
    # schooling
    'education', 'education-num',
    # household and job
    'marital-status', 'occupation', 'relationship',
    'race', 'sex',
    # finances and working time
    'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country',
    # last column of the file
    'income',
]

In [3]:
# Load the raw census table. The file has no header row, so the labels come
# from `columns`.
#
# The UCI file separates fields with ", " (comma followed by a space). Without
# `skipinitialspace=True` every string column would keep that leading blank
# (' State-gov', ' <=50K', ' ?'), and those polluted labels would flow into the
# fitted model and the synthetic samples.
#
# Missing values are encoded as the literal '?' in the source file; they are
# deliberately left as an ordinary category here rather than converted to NaN.
df = pd.read_csv(url, names=columns, skipinitialspace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Map each table name to its DataFrame; this notebook works with a single
# table, registered under the name 'census'.
tables = dict(census=df)

In [5]:
# Metadata describing the single "census" table: one entry per column, each
# tagged either as an integer-valued numerical field or as a categorical one.
metadata = {
    "tables": [
        {
            "fields": [
                {"name": "age", "type": "numerical", "subtype": "integer"},
                {"name": "workclass", "type": "categorical"},
                {"name": "fnlwgt", "type": "numerical", "subtype": "integer"},
                {"name": "education", "type": "categorical"},
                {"name": "education-num", "type": "numerical", "subtype": "integer"},
                {"name": "marital-status", "type": "categorical"},
                {"name": "occupation", "type": "categorical"},
                {"name": "relationship", "type": "categorical"},
                {"name": "race", "type": "categorical"},
                {"name": "sex", "type": "categorical"},
                {"name": "capital-gain", "type": "numerical", "subtype": "integer"},
                {"name": "capital-loss", "type": "numerical", "subtype": "integer"},
                {"name": "hours-per-week", "type": "numerical", "subtype": "integer"},
                {"name": "native-country", "type": "categorical"},
                {"name": "income", "type": "categorical"},
            ],
            "name": "census",
        },
    ],
}

In [6]:
from sdv import SDV

# Create the SDV instance and fit it on the in-memory tables, using the
# metadata dictionary to describe their fields.
sdv = SDV()
sdv.fit(metadata, tables=tables)

2020-06-25 23:39:05,857 - INFO - modeler - Modeling census
2020-06-25 23:39:05,857 - INFO - metadata - Loading transformer NumericalTransformer for field age
2020-06-25 23:39:05,858 - INFO - metadata - Loading transformer CategoricalTransformer for field workclass
2020-06-25 23:39:05,858 - INFO - metadata - Loading transformer NumericalTransformer for field fnlwgt
2020-06-25 23:39:05,859 - INFO - metadata - Loading transformer CategoricalTransformer for field education
2020-06-25 23:39:05,859 - INFO - metadata - Loading transformer NumericalTransformer for field education-num
2020-06-25 23:39:05,860 - INFO - metadata - Loading transformer CategoricalTransformer for field marital-status
2020-06-25 23:39:05,860 - INFO - metadata - Loading transformer CategoricalTransformer for field occupation
2020-06-25 23:39:05,860 - INFO - metadata - Loading transformer CategoricalTransformer for field relationship
2020-06-25 23:39:05,861 - INFO - metadata - Loading transformer CategoricalTransformer 

In [7]:
# Draw as many synthetic 'census' rows as the real table contains; the result
# is indexed by table name, so preview the first rows of the 'census' entry.
sampled = sdv.sample('census', num_rows=df.shape[0])
sampled['census'].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,42,Private,47585,Assoc-voc,9,Never-married,Prof-specialty,Not-in-family,White,Male,2398,3,44,United-States,<=50K
1,56,Private,92870,HS-grad,13,Married-civ-spouse,Sales,Not-in-family,White,Male,3624,269,30,United-States,<=50K
2,54,Private,218711,HS-grad,13,Never-married,Other-service,Not-in-family,White,Female,4420,-269,43,United-States,<=50K
3,31,?,71625,Some-college,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,3196,898,51,United-States,<=50K
4,37,Private,184276,Bachelors,14,Never-married,Craft-repair,Husband,White,Male,4785,907,32,United-States,<=50K


In [8]:
from sdv.evaluation import evaluate

# Take a fresh synthetic draw sized to the real 'census' table.
samples = sdv.sample_all(tables['census'].shape[0])

# Score the synthetic tables against the real ones, using the metadata held by
# the fitted SDV instance; the returned value is shown as the cell output.
evaluate(samples, real=tables, metadata=sdv.metadata)

-43.51455835774797