In [None]:
! pip install numpy scikit-learn

# Load toy dataset

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Shared settings reused by later cells.
seed = 42
target_column = "target"
discrete_features = ["sex"]

# Full diabetes table as a single DataFrame, then an 80/20 train/test split.
X = load_diabetes(as_frame=True)["frame"]
X_train, X_test = train_test_split(X, random_state=seed, test_size=0.2)

X_train.head()

In [None]:
! pip install synthyverse[arf]

# Import and train synthetic data generator

In [None]:
from synthyverse.generators import ARFGenerator

generator = ARFGenerator(num_trees=20, random_state=0)
# Pass the notebook-wide `discrete_features` list so the generator is told
# about the same discrete columns as the evaluator and benchmark cells.
generator.fit(X_train, discrete_features=discrete_features)

# Request as many synthetic rows as the full dataset has.
syn = generator.generate(len(X))
syn.head()

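Each generator also takes parameters related to preprocessing:
- how to handle missing values (`missing_imputation_method`, `retain_missingness`)
- whether to enforce any constraints (`constraints`)
- how to handle mixed numerical-discrete features (`encode_mixed_numerical_features`)
- whether to normalize numerical features through quantile transformation (`quantile_transform_numericals`)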

The last two can be especially important for synthetic data generators which rely on continuous numerical input distributions, such as deep generative models.

In [None]:
! pip install synthyverse[ctgan]

In [None]:
from synthyverse.generators import CTGANGenerator
import numpy as np

# Add some missing values to the first few columns.
# A generator seeded with the notebook-wide `seed` picks the same cells on
# every run, so re-running this cell does not keep adding new NaNs.
rng = np.random.default_rng(seed)
# Work on an explicit copy so the writes below never go through a view of X.
X_train = X_train.copy()
n_candidate_columns = X_train.shape[1] - 5  # the last 5 columns stay complete
for i in range(25):
    X_train.iloc[i, rng.integers(0, n_candidate_columns)] = np.nan

generator = CTGANGenerator(
    constraints=["s1>=s2+s3"],  # enforce a constraint on the synthetic data
    missing_imputation_method="random",  # random imputation of missing values
    retain_missingness=True,  # retain missing values in the synthetic data
    encode_mixed_numerical_features=True,
    quantile_transform_numericals=True,
    random_state=0,
)
# Pass the notebook-wide `discrete_features` list so the generator is told
# about the same discrete columns as the evaluator and benchmark cells.
generator.fit(X_train, discrete_features=discrete_features)

# Request as many synthetic rows as the full dataset has.
syn = generator.generate(len(X))
syn.head()

# Evaluate synthetic data quality

In [None]:
from synthyverse.evaluation import TabularMetricEvaluator

# Metrics are given as a mapping from metric name to that metric's parameters.
# NOTE(review): a plain list of names such as ["mle", "dcr", "similarity"]
# is presumably accepted too (the benchmark cell passes a list) - confirm.
metrics = {
    "mle-trts": {"train_set": "real"},
    "mle-tstr": {"train_set": "synthetic"},
    "dcr": {"estimates": ["mean", 0.01, 0.05]},
    "similarity": {},
}
evaluator = TabularMetricEvaluator(metrics, discrete_features, target_column, seed)
# `syn` is whatever the most recently executed generator cell produced.
results = evaluator.evaluate(X_train, X_test, syn)
results

# Unified pipeline for synthetic data generation and evaluation

In [None]:
from synthyverse.benchmark import TabularBenchmark

benchmark = TabularBenchmark(
    # generator under test and its keyword arguments
    generator_name="arf",
    generator_params={},
    # metrics to report
    metrics=["mle", "similarity", "classifier_test"],
    # repetition counts
    n_random_splits=1,
    n_inits=1,
    n_generated_datasets=1,
    # data split sizes
    test_size=0.2,
    val_size=0.1,
    # preprocessing options (same names as the generator arguments shown earlier)
    constraints=[],
    missing_imputation_method="drop",
    retain_missingness=False,
    encode_mixed_numerical_features=False,
    quantile_transform_numericals=False,
)

# Run the benchmark on the full dataset.
results = benchmark.run(X, target_column=target_column, discrete_columns=discrete_features)
results