<a href="https://colab.research.google.com/github/kshitijzutshi/DAMG7245-Assignment-3-Repo/blob/main/docs/notebooks/GRETEL_create_synthetic_data_from_a_dataframe_or_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create synthetic data with the Python SDK

This notebook will walk you through the process of creating your own synthetic data using Gretel's Python SDK from a CSV or a DataFrame of your choosing. 

To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.



In [1]:
%%capture
!pip install -U gretel-client

In [2]:
# Authenticate this notebook session against the Gretel API

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Show full cell contents when DataFrames are displayed (no column-width cap).
pd.set_option('max_colwidth', None)

# The key is requested interactively through getpass, so it is never written
# into the notebook source.
configure_session(
    ClientConfig(
        api_key=getpass(prompt="Enter Gretel API key"),
        endpoint="https://api.gretel.cloud",
    )
)

                            

Enter Gretel API key··········


In [8]:
# Create a Gretel project with the display name "synthetic-data".
# The resulting `project` object is what the training cell uses to create
# the model.

from gretel_client import create_project

project = create_project(display_name="synthetic-data")

## Create the synthetic data configuration
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [5]:
import json
import smart_open
import yaml

# The template is fetched through `smart_open.open` (module-qualified) rather
# than `from smart_open import open`, which would replace the built-in open()
# for every cell executed after this one.
config_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"
with smart_open.open(config_url, 'r') as stream:
    config = yaml.safe_load(stream)

# Set the model epochs to 50
config['models'][0]['synthetics']['params']['epochs'] = 50

# Pretty-print the effective configuration so it can be reviewed before training.
print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 50,
          "batch_size": 64,
          "vocab_size": 20000,
          "reset_states": false,
          "learning_rate": 0.01,
          "rnn_units": 256,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1,
          "data_upsample_limit": 10000
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 5000,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": "medium"
        }
      }
    }
  ]
}


## Load and preview the source dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [6]:
# Load and preview the DataFrame to train the synthetic model on.
import pandas as pd

# Absolute path to the training CSV. NOTE(review): '/content/...' looks like a
# Colab working-directory path -- adjust it when running elsewhere.
dataset_path = '/content/train.csv'
# Alternative source: a CSV hosted at a public URL (uncomment to use it instead).
# dataset_path = 'https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/USAdultIncome5k.csv'
df = pd.read_csv(dataset_path)
# Write a local copy without the index column; the training cell sets the
# model's data_source to this same file name.
df.to_csv('training_data.csv', index=False)
# Last expression of the cell: renders the DataFrame as the cell output.
df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,area_code_415,no,no,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,no
4246,WV,73,area_code_408,no,no,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,no
4247,NC,75,area_code_408,no,no,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,no
4248,HI,50,area_code_408,no,yes,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,no


## Train the synthetic model
In this step, we will task a worker, running either in the Gretel cloud or locally, to train a synthetic model on the source dataset.

In [9]:
from gretel_client.helpers import poll

# Build a model object for this project from the configuration dict prepared
# in the configuration cell.
model = project.create_model_obj(model_config=config)
# Point the model at the CSV written by the dataset-loading cell.
model.data_source = 'training_data.csv'
# NOTE(review): upload_data_source=True presumably uploads the local CSV along
# with the job -- confirm against the gretel-client documentation.
model.submit(upload_data_source=True)

# Follow the submitted job; the status and log lines in this cell's output
# appear after the poller starts.
poll(model)

[32mINFO: [0mStarting poller


{
    "uid": "621d3ae508ddd4b2c4ab007f",
    "guid": "model_25ktDAIFvmOPfVwBPuu81v3bHyo",
    "model_name": "shaggy-quizzical-dingo",
    "runner_mode": "cloud",
    "user_id": "621d391bbff6213002668507",
    "user_guid": "user_25ksHQGND5uMVGKEdPGyWXSx93n",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "621d3ada2199163c444ddcf9",
    "project_guid": "proj_25ktBiepgKXQkyKtAakQpBEEhxA",
    "status_history": {
        "created": "2022-02-28T21:13:09.159589Z"
    },
    "last_modified": "2022-02-28T21:13:09.166345Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:8ffc21c9fa07107890499536146d776f237ef75ebf2615ce80f17921967bcae7",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
          

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2022-02-28T21:13:30.917524Z  Starting synthetic model training
2022-02-28T21:13:30.922135Z  Loading training data
2022-02-28T21:13:31.104783Z  Training data loaded, detected format: 'csv'
2022-02-28T21:13:31.114570Z  Training data loaded
{
    "record_count": 4250,
    "field_count": 20,
    "upsample_count": 5750
}
2022-02-28T21:13:34.729526Z  Creating semantic validators and preparing training data
2022-02-28T21:13:47.497491Z  Beginning ML model training
2022-02-28T21:13:59.053467Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.2454,
    "loss": 3.9648,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-02-28T21:14:02.666425Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.493,
    "loss": 2.4657,
    "val

## View the generated synthetic data

In [10]:
# Load the synthetic data preview artifact of the trained model

preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression='gzip')

# Last expression of the cell: renders the DataFrame as the cell output.
synthetic_df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,NY,107,area_code_415,no,no,0,255.4,97,43.42,270.9,67,23.03,263.4,100,11.85,14.4,5,3.89,1,no
1,ND,59,area_code_415,no,no,0,201.4,98,34.24,281.1,96,23.89,282.5,73,12.71,11.9,9,3.21,1,no
2,CT,41,area_code_408,no,no,0,166.0,100,28.22,270.4,108,22.98,195.0,85,8.78,10.0,4,2.70,1,no
3,CO,95,area_code_408,yes,no,0,121.4,104,20.64,97.3,93,8.27,257.7,78,11.50,10.1,7,2.73,1,no
4,CO,152,area_code_408,no,no,0,176.3,84,29.95,180.8,97,15.37,255.3,105,11.48,10.4,6,2.81,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,OH,159,area_code_415,no,no,0,189.6,101,32.23,229.7,119,19.52,269.2,89,12.11,5.4,2,1.46,1,no
4996,AZ,131,area_code_415,yes,no,0,75.6,119,13.57,317.2,124,26.94,202.8,94,9.13,11.6,5,3.13,2,no
4997,NV,98,area_code_510,no,no,0,124.4,110,21.15,216.8,91,18.43,221.6,145,9.97,11.7,8,3.16,2,no
4998,CO,172,area_code_510,no,no,0,184.6,111,31.38,152.0,92,12.92,191.4,141,8.61,12.3,8,3.32,1,no


## View the synthetic data quality report

In [11]:
# Display the report that compares the statistical performance of the training and synthetic data

import IPython
import smart_open

# Read the report inside a `with` block so the stream is closed as soon as
# its contents have been read, and call `smart_open.open` through the module
# so the built-in open() is not replaced for later cells.
with smart_open.open(model.get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

# Last expression of the cell: renders the report HTML inline.
IPython.display.HTML(data=report_html)

0,1,2,3,4,5
Synthetic Data Use Cases,Excellent,Good,Moderate,Poor,Very Poor
Significant tuning required to improve model,,,,,
Improve your model using our tips and advice,,,,,
Demo environments or mock data,,,,,
Pre-production testing environments,,,,,
Balance or augment machine learning data sources,,,,,
Machine learning or statistical analysis,,,,,

0,1,2,3,4
Data Sharing Use Case,Excellent,Very Good,Good,Normal
"Internally, within the same team",,,,
"Internally, across different teams",,,,
"Externally, with trusted partners",,,,
"Externally, public availability",,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,4250,4250
Column Count,20,20
Training Lines Duplicated,--,0

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
account_length,215,0,2.49,Categorical,Excellent
total_day_calls,120,0,2.51,Categorical,Excellent
total_eve_calls,123,0,2.52,Categorical,Excellent
state,51,0,2.0,Categorical,Excellent
total_night_calls,128,0,2.51,Categorical,Excellent
total_day_charge,1843,0,4.88,Numeric,Excellent
total_day_minutes,1843,0,4.93,Numeric,Excellent
total_eve_minutes,1773,0,4.98,Numeric,Excellent
total_night_charge,992,0,4.23,Numeric,Excellent
total_eve_charge,1572,0,4.85,Numeric,Excellent


## Generate unlimited synthetic data
You can now use the trained synthetic model to generate as much synthetic data as you like.

In [12]:
# Request an additional batch of records from the trained model

# Parameters passed to the "generate" action.
generation_params = {"num_records": 100, "max_invalid": 500}

record_handler = model.create_record_handler_obj()
record_handler.submit(action="generate", params=generation_params)

# Follow the generation job; its progress is printed as the cell output.
poll(record_handler)

[32mINFO: [0mStarting poller


{
    "uid": "621d415c04f0292e4fbb1569",
    "guid": "model_run_25kwZ5B5droAaK2Osx9lkBxjciX",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "621d391bbff6213002668507",
    "user_guid": "user_25ksHQGND5uMVGKEdPGyWXSx93n",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "621d3ada2199163c444ddcf9",
    "project_guid": "proj_25ktBiepgKXQkyKtAakQpBEEhxA",
    "status_history": {
        "created": "2022-02-28T21:40:44.786000Z"
    },
    "last_modified": "2022-02-28T21:40:44.919000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:8ffc21c9fa07107890499536146d776f237ef75ebf2615ce80f17921967bcae7",
    "model_id": "621d3ae508ddd4b2c4ab007f",
    "model_guid": "model_25ktDAIFvmOPfVwBPuu81v3bHyo",
    "action": "generate",
    "config": {
        

[32mINFO: [0mStatus is created. A Record generation job has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2022-02-28T21:41:08.154644Z  Loading model to worker
2022-02-28T21:41:08.797687Z  Checking for synthetic smart seeds
2022-02-28T21:41:08.798053Z  No smart seeds provided, will attempt generation without them
2022-02-28T21:41:08.798883Z  Loading model
2022-02-28T21:41:11.077949Z  Generating records
{
    "num_records": 100
}
2022-02-28T21:41:16.089182Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-02-28T21:41:21.096755Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-02-28T21:41:23.100054

In [13]:
# Load the records produced by the generation job.
# Note: this rebinds `synthetic_df`, replacing the training preview that was
# loaded from the model's "data_preview" artifact.
generated_link = record_handler.get_artifact_link("data")
synthetic_df = pd.read_csv(generated_link, compression='gzip')

# Last expression of the cell: renders the DataFrame as the cell output.
synthetic_df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,MA,90,area_code_415,no,no,0,181.8,82,30.91,285.5,88,24.24,209.0,65,9.40,9.7,2,2.62,2,no
1,PA,137,area_code_415,yes,no,0,167.7,106,28.51,141.2,80,12.00,169.2,67,7.61,9.7,3,2.62,0,no
2,TX,69,area_code_408,no,no,0,252.4,73,42.91,72.2,89,6.34,214.1,78,9.63,12.7,7,3.43,1,no
3,SC,91,area_code_408,no,no,0,174.7,87,29.70,124.2,138,10.56,239.9,95,10.80,9.5,8,2.57,1,no
4,NV,78,area_code_415,no,no,0,279.7,96,50.92,249.6,106,21.25,251.9,111,11.34,7.4,3,2.00,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,NJ,107,area_code_510,no,no,0,147.8,92,25.13,254.2,116,21.61,127.1,100,5.72,11.3,2,3.05,3,yes
96,MN,71,area_code_415,no,no,0,215.6,110,36.65,210.0,102,17.85,256.2,110,11.53,11.8,4,3.19,0,no
97,ID,126,area_code_510,no,no,0,192.4,97,32.71,226.1,78,19.26,313.2,99,14.08,10.9,4,2.94,2,no
98,NC,133,area_code_510,no,no,0,141.4,131,24.04,253.0,112,21.51,99.7,104,4.49,5.7,4,1.54,1,no
