In [1]:
from b2aiprep.synthetic import (
    fit_synthesizer,
    generate_tabular_data,
    run_diagnostics,
    evaluate_data,
    get_column_plots
)
import os

In [2]:
# All locations are built relative to the directory the kernel was started in
# (os.getcwd()), so the notebook expects ../data/test and ../models/synthesizers
# to exist next to that directory.

# Source CSV handed to the synthesizer and to the comparison steps below.
source_data_csv_path = f"{os.getcwd()}/../data/test/tiny.csv"

# Artifacts written by the later cells: sampled rows, the two JSON reports,
# and the directory that receives the per-column plots.
synthetic_data_path = f"{os.getcwd()}/../data/test/tiny_synthetic.csv"
synthetic_data_diagnostic_path = f"{os.getcwd()}/../data/test/tiny_synthetic_diagnostic.json"
synthetic_data_evaluation_path = f"{os.getcwd()}/../data/test/tiny_synthetic_evaluation.json"
synthetic_data_column_plot_path = f"{os.getcwd()}/../data/test/"

# Number of rows to sample from the synthesizer.
n_synthetic_rows = 3

# Pickle file used to persist / reload the synthesizer.
synthesizer_path = f"{os.getcwd()}/../models/synthesizers/test_tiny_CTGAN_synthesizer.pkl"

#### Train a synthesizer model that learns the distribution of the data:

In [3]:
# NOTE(review): with refit=False this presumably reuses a synthesizer already
# saved at synthesizer_path instead of retraining — confirm against b2aiprep.
fit_synthesizer(
    source_data_csv_path,
    refit=False,
    synthesizer_path=synthesizer_path,
)

INFO:b2aiprep.synthetic.synthetic:Metadata detected from source data CSV.
INFO:b2aiprep.synthetic.synthetic:Loaded synthesizer from file.


<sdv.single_table.ctgan.CTGANSynthesizer at 0x329c86d10>

#### Sample from the trained synthesizer model to get synthetic data:

In [4]:
# Draw n_synthetic_rows rows from the synthesizer stored at synthesizer_path
# and write them to synthetic_data_path.
generate_tabular_data(
    synthesizer_path=synthesizer_path,
    synthetic_data_path=synthetic_data_path,
    n_synthetic_rows=n_synthetic_rows,
)

INFO:b2aiprep.synthetic.synthetic:Sampling 3 rows with the synthesizer...
INFO:SingleTableSynthesizer:{'EVENT': 'Sample', 'TIMESTAMP': datetime.datetime(2024, 10, 24, 13, 53, 54, 306756), 'SYNTHESIZER CLASS NAME': 'CTGANSynthesizer', 'SYNTHESIZER ID': 'CTGANSynthesizer_1.13.1_48aaf33418374fafa254c26dbbc3cea7', 'TOTAL NUMBER OF TABLES': 1, 'TOTAL NUMBER OF ROWS': 3, 'TOTAL NUMBER OF COLUMNS': 3}
INFO:b2aiprep.synthetic.synthetic:3 rows sampled.
INFO:b2aiprep.synthetic.synthetic:Data saved to /Users/isaacbevers/sensein/b2ai-wrapper/b2aiprep/docs/../data/test/tiny_synthetic.csv.


Unnamed: 0,id,name,age
0,4,Mike,39
1,3,John,28
2,3,Mike,45


#### Run diagnostics to ensure that the synthetic data is valid:

In [5]:

# Compare the synthetic CSV against the source CSV and write the diagnostic
# report to synthetic_data_diagnostic_path.
run_diagnostics(
    synthetic_data_path,
    source_data_csv_path,
    diagnostic_report_path=synthetic_data_diagnostic_path,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 3/3 [00:00<00:00, 2087.41it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 800.59it/s]|
Data Structure Score: 100.0%



INFO:b2aiprep.synthetic.synthetic:Basic validity checks completed.
INFO:b2aiprep.synthetic.synthetic:Diagnostic report saved to /Users/isaacbevers/sensein/b2ai-wrapper/b2aiprep/docs/../data/test/tiny_synthetic_diagnostic.json.


Overall Score (Average): 100.0%



{'report_type': 'DiagnosticReport',
 'generated_date': '2024-10-24',
 'sdmetrics_version': '0.14.1',
 'num_rows_real_data': 4,
 'num_rows_synthetic_data': 3,
 'generation_time': 0.005666017532348633,
 'score': 1.0}

#### Evaluate how well the synthetic data matches the distribution of the real data:

In [6]:
# Score the synthetic CSV against the source CSV and write the evaluation
# report to synthetic_data_evaluation_path.
evaluate_data(
    synthetic_data_path,
    source_data_csv_path,
    evaluation_report_path=synthetic_data_evaluation_path,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 3/3 [00:00<00:00, 1227.48it/s]|
Column Shapes Score: 52.78%

(2/2) Evaluating Column Pair Trends: |██████████| 3/3 [00:00<00:00, 179.02it/s]|
Column Pair Trends Score: 57.26%



INFO:b2aiprep.synthetic.synthetic:Evaluation report saved to /Users/isaacbevers/sensein/b2ai-wrapper/b2aiprep/docs/../data/test/tiny_synthetic_evaluation.json.


Overall Score (Average): 55.02%



{'report_type': 'QualityReport',
 'generated_date': '2024-10-24',
 'sdmetrics_version': '0.14.1',
 'num_rows_real_data': 4,
 'num_rows_synthetic_data': 3,
 'generation_time': 0.021953105926513672,
 'score': 0.550169275730512}

#### Plot the distribution of each column in the real data alongside the synthetic data for comparison:

In [7]:
# Produce the per-column real-vs-synthetic plots; files are saved under
# synthetic_data_column_plot_path.
get_column_plots(
    synthetic_data_path,
    source_data_csv_path,
    save_directory=synthetic_data_column_plot_path,
)

INFO:b2aiprep.synthetic.synthetic:Plot saved for column 'id' at: /Users/isaacbevers/sensein/b2ai-wrapper/b2aiprep/docs/../data/test/id_column_real_and_synthetic_distribution_plot.png
INFO:b2aiprep.synthetic.synthetic:Plot saved for column 'name' at: /Users/isaacbevers/sensein/b2ai-wrapper/b2aiprep/docs/../data/test/name_column_real_and_synthetic_distribution_plot.png
INFO:b2aiprep.synthetic.synthetic:Plot saved for column 'age' at: /Users/isaacbevers/sensein/b2ai-wrapper/b2aiprep/docs/../data/test/age_column_real_and_synthetic_distribution_plot.png


[Figure({
     'data': [{'fill': 'tozeroy',
               'hovertemplate': '<b>Real</b><br>Frequency: %{y}<extra></extra>',
               'legendgroup': 'Real',
               'marker': {'color': '#000036'},
               'mode': 'lines',
               'name': 'Real',
               'showlegend': True,
               'type': 'scatter',
               'x': [1.0, 1.006, 1.012, ..., 3.982, 3.988, 3.994],
               'xaxis': 'x',
               'y': array([0.17594505, 0.17649864, 0.17705029, ..., 0.17759998, 0.17705029,
                           0.17649864]),
               'yaxis': 'y'},
              {'fill': 'tozeroy',
               'hovertemplate': '<b>Synthetic</b><br>Frequency: %{y}<extra></extra>',
               'legendgroup': 'Synthetic',
               'marker': {'color': '#01E0C9'},
               'mode': 'lines',
               'name': 'Synthetic',
               'showlegend': True,
               'type': 'scatter',
               'x': [3.0, 3.002, 3.004, ..., 3.99399