In [1]:
!pip install -U gretel-client

Collecting gretel-client
  Downloading gretel_client-0.17.9-py3-none-any.whl (302 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.0/302.0 kB 3.9 MB/s eta 0:00:00
Collecting backports.cached-property==1.0.0.post2 (from gretel-client)
  Downloading backports.cached_property-1.0.0.post2-py3-none-any.whl (5.6 kB)
Collecting click==8.1.3 (from gretel-client)
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 96.6/96.6 kB 7.3 MB/s eta 0:00:00
Collecting docker==6.1.2 (from gretel-client)
  Downloading docker-6.1.2-py3-none-any.whl (148 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 148.1/148.1 kB 8.8 MB/s eta 0:00:00
Collecting kubernetes==28.1.0 (from gretel-client)
  Downloading kubernetes-28.1.0-py2.py3-none-any.whl (1.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB

In [2]:
import pandas as pd

from gretel_client.config import RunnerMode
from gretel_client.evaluation.quality_report import QualityReport
from gretel_client import configure_session
from gretel_client.projects import create_or_get_unique_project

In [3]:
# Do not truncate long cell values when DataFrames are rendered.
# The fully qualified option name is used: pandas resolves abbreviated names
# such as "max_colwidth" by pattern matching, which can stop working if a
# later release adds another option with a similar name.
pd.set_option("display.max_colwidth", None)

# Specify your Gretel API key.
# api_key="prompt" asks for the key interactively, so it is never hardcoded
# in the notebook; per the recorded output, cache="yes" saves the resulting
# config to disk and the session is checked against the Gretel endpoint.
configure_session(api_key="prompt", cache="yes", validate=True)



Gretel API Key: ··········
Caching Gretel config to disk.
Using endpoint https://api.gretel.cloud
Logged in as chinie876@gmail.com ✅


In [4]:
# Real-world reference dataset: location of the CSV, kept as a plain URL
# string because it is reused later in the notebook.
real_data = (
    "https://gretel-public-website.s3.us-west-2.amazonaws.com"
    "/datasets/USAdultIncome5k.csv"
)

# Read the CSV into a DataFrame and leave it as the cell's last expression
# so the notebook renders a preview.
real_df = pd.read_csv(filepath_or_buffer=real_data)
real_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,42,Private,255847,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,4386,0,48,United-States,>50K
1,34,Private,111567,HS-grad,9,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States,<=50K
2,34,Private,263307,Bachelors,13,Never-married,Sales,Unmarried,Black,Male,0,0,45,?,<=50K
3,69,Private,174474,10th,6,Separated,Machine-op-inspct,Not-in-family,White,Female,0,0,28,Peru,<=50K
4,26,Private,260614,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,42,Self-emp-inc,287037,12th,8,Divorced,Craft-repair,Not-in-family,White,Male,0,0,10,United-States,<=50K
4996,48,Private,236858,11th,7,Divorced,Other-service,Not-in-family,White,Female,0,0,31,United-States,<=50K
4997,53,Private,317313,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K
4998,23,Private,113601,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States,<=50K


In [5]:

# Synthetic counterpart of the real dataset: location of the generated CSV,
# kept as a plain URL string because it is reused later in the notebook.
synth_data = (
    "https://gretel-public-website.s3.us-west-2.amazonaws.com"
    "/datasets/USAdultIncome5kGenerated.csv"
)

# Read the CSV into a DataFrame and leave it as the cell's last expression
# so the notebook renders a preview.
synth_df = pd.read_csv(filepath_or_buffer=synth_data)
synth_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,29,Private,179541.0,11th,7,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
1,17,?,143604.0,10th,6,Never-married,?,Own-child,White,Male,0,0,12,United-States,<=50K
2,80,?,242001.0,Masters,14,Widowed,?,Not-in-family,Other,Male,0,0,48,United-States,<=50K
3,27,?,143058.0,11th,7,Never-married,?,Own-child,White,Female,0,0,40,United-States,<=50K
4,29,Private,116834.0,HS-grad,9,Never-married,?,Not-in-family,White,Male,0,0,35,?,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,49,Private,94413.0,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
4996,42,Private,31621.0,Bachelors,13,Separated,Sales,Own-child,Black,Female,0,0,35,United-States,<=50K
4997,35,Private,167967.0,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,35,United-States,>50K
4998,37,Private,213640.0,Some-college,10,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K


In [6]:
# Build a quality report that evaluates the synthetic data against the real
# data. Note that both arguments are the CSV URLs, not the previewed
# DataFrames.
report = QualityReport(
    ref_data=real_data,
    data_source=synth_data,
)

# Execute the evaluation; later cells read the results from `report`.
report.run()

In [7]:
# Show a compact summary of the finished report. In the recorded run this is
# a dict with the keys 'raw_score', 'grade' and 'score'.
report.peek()

{'raw_score': 90.52592592592593, 'grade': 'Excellent', 'score': 90}

In [8]:


import IPython

# Render the full report inline from its HTML representation.
# NOTE(review): the `isolated` metadata flag presumably asks the frontend to
# sandbox the markup so the report's styles do not affect the notebook page —
# confirm for the frontend in use.
IPython.display.HTML(data=report.as_html, metadata={"isolated": True})



0,1,2,3,4,5
How to interpret your SQS,Excellent,Good,Moderate,Poor,Very Poor
Suitable for machine learning or statistical analysis,,,,,
Suitable for balancing or augmenting machine learning data sources,,,,,
Suitable for pre-production testing environments,,,,,
Suitable for demo environments or mock data,,,,,
Improve your model using our tips and advice,,,,,
Significant tuning required to improve model,,,,,

0,1,2,3,4,5
Data Sharing Use Case,Excellent,Very Good,Good,Normal,Poor
"Internally, within the same team",,,,,
"Internally, across different teams",,,,,
"Externally, with trusted partners",,,,,
"Externally, public availability",,,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,5000,5000
Column Count,15,15
Training Lines Duplicated,--,0

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
education_num,16,0,1.55,Numeric,Excellent
education,16,0,8.43,Categorical,Excellent
native_country,40,0,12.3,Categorical,Excellent
income_bracket,2,0,4.76,Binary,Excellent
hours_per_week,82,0,1.98,Numeric,Excellent
occupation,15,0,12.18,Categorical,Excellent
relationship,6,0,9.1,Categorical,Excellent
age,70,0,2.0,Numeric,Excellent
capital_loss,53,0,1.14,Numeric,Excellent
marital_status,7,0,14.52,Categorical,Excellent
