# Create a synthetic version of your own CSV or DataFrame

This blueprint utilizes Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure the data generated has the same semantics as the source data. Additionally, the SDKs do autmoatic header clustering to help maintain statistical relations between columns.

In [1]:
!pip install -U gretel-client gretel-synthetics pandas

Requirement already up-to-date: gretel-client in /usr/local/lib/python3.7/dist-packages (0.7.12)
Requirement already up-to-date: gretel-synthetics in /usr/local/lib/python3.7/dist-packages (0.15.6)
Requirement already up-to-date: pandas in /usr/local/lib/python3.7/dist-packages (1.2.4)


In [2]:
# Load your Gretel API key. You can acquire this from the Gretel Console 
# @ https://console.gretel.cloud

import pandas as pd
from gretel_client import get_cloud_client

pd.set_option('max_colwidth', None)

client = get_cloud_client(prefix="api", api_key="prompt")
client.install_packages()

Enter Gretel API key: ··········


INFO pkg_installers.py: Authenticating with package manager
INFO pkg_installers.py: Installing packages (this might take a while)
ERROR pkg_installers.py: /usr/bin/python3 -m pip --disable-pip-version-check install https://gretel-opt-prod-usw2.s3.amazonaws.com/priv/pip/gretel-helpers/0.8.2/gretel_helpers-0.8.2-py3-none-any.whl?AWSAccessKeyId=ASIARC2BUADHUP25SVOC&Signature=nDmIrmdSfIss0A%2Fd9klgGaTfEeo%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKb%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIGs5nw9t4pwoe21H9lQJF9NPwIGDhsgvejvhPvxZSVrjAiEAiPjchApJ2NccNaYbDt2Gb7Sd75Lneva8qJZRhCMXIZAq3gEILxACGgwwNzQ3NjI2ODI1NzUiDNhV4%2BWnsGRgK1O8eCq7AaUOC3L5yHsN%2Bm6TK%2FKpi%2Fyre7yJ3EWPwPJDhUeIiWkhT2bhL7wmwlUs40BDSh11c%2BE86dhMxZWPG9JWhxr8t6YZ8WwmeFCrKhjnPm5tV9qYa%2FsvsS6Hm%2BdrHUgVgYnvcuZ4LeBdHwUPEwVggjsjFJTp32JWHXw8faDzWVbfJdGux79SWdGuI3tKInGD0kpQT%2BFR%2Ft2iBqLuEJITpOyRH4nvrFh5GE67A%2FI3IZA8YPB3M%2F60rS2d5CyFwFgw2%2BPPhAY64AE8nStvsBKr3nJVRIEu1j4kS9ZlP9mkYCPVIPo3%2BIAI0PwAU8xX3xPRNq5HUQ5JfOmBl1Wgh

In [4]:
# Load and preview dataset

import pandas as pd

dataset_path = 'https://www.dropbox.com/s/ywppusa8h7oq1e5/SE1_2018.csv?dl=0'
nrows = 10000  # We will use this later when generating data
training_df = pd.read_csv(dataset_path, nrows=nrows)
print(training_df.head())

ParserError: ignored

In [None]:
# Create the Gretel Synthtetics Training / Model Configuration
#
# Gretel now offers Configuration Templates that provide starting points for a variety
# of training data characteristics.
#
# You may browse the options here: https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics
#
# The helper function below will fetch the configuration based on the filename *WITHOUT the file extension*

from pathlib import Path

checkpoint_dir = str(Path.cwd() / "checkpoints-synthetics")

try:
    from gretel_client import get_synthetics_config
    
    # NOTE: Replace the "default" param with any of the configuration filenames (minus extension)
    #
    # https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics
    #
    # example: get_synthetics_config("low-record-count")

    config_template = get_synthetics_config("default")
    print(f"Loaded config: {config_template}")
except ImportError:
    print("ERROR: Could not load remote template, using default params. Please ensure you have the latest gretel-client installed.")
    config_template = {"epochs": 100}
    

# Set or update any custom parameters here
 
config_template["overwrite"] = True

In [None]:
# Capture transient import errors in Google Colab

try:
    from gretel_helpers.synthetics import SyntheticDataBundle
except FileNotFoundError:
    from gretel_helpers.synthetics import SyntheticDataBundle

In [None]:
# Create a Gretel Synthetic Data Bundle

from gretel_helpers.synthetics import create_df, SyntheticDataBundle

model = SyntheticDataBundle(
    training_df=training_df,
    delimiter=None, # if ``None``, it will try and automatically be detected, otherwise you can set it
    auto_validate=True, # build record validators that learn per-column, these are used to ensure generated records have the same composition as the original
    synthetic_config=config_template, # the config for Synthetics
)

In [None]:
model.build()

In [None]:
model.train()

In [None]:
# num_lines: how many rows to generate
# max_invalid: the number of rows that do not pass semantic validation, if this number is exceeded, training will
# stop
model.generate(num_lines=nrows, max_invalid=nrows)

In [None]:
model.get_synthetic_df()

In [None]:
# Generate report that shows the statistical performance between the training and synthetic data
import IPython

report_path = './report.html'
model.generate_report(report_path=report_path)
IPython.display.HTML(filename=report_path)

In [None]:
# Optionally save your model

model.save("my_model.tar.gz")

In [None]:
# Save synthetic dataframe locally and to a private Gretel project 

df = model.get_synthetic_df()
df.to_csv('synthetic-data.csv', index=False)

# Publish newly created synthetic data to a new private Gretel project 
project = client.get_project(display_name="Blueprint: Create Synthetic Data", create=True)
project.send_dataframe(df, detection_mode="all")
print(f"View this project at: {project.get_console_url()}")