In [28]:
import great_expectations as ge 
import pandas as pd
import random
import os
from pprint import pprint

Create Data

In [6]:
random.seed(87)
gender = ["Male", "Female", "Other"]
blood_sugar_levels = [random.randint(70, 130) for _ in range(20)]
gender = [random.choice(gender) for _ in range(20)]

data = {"Gender": gender, "Blood_sugar_levels": blood_sugar_levels}
df = pd.DataFrame(data)
print(df)

    Gender  Blood_sugar_levels
0   Female                  79
1     Male                 117
2   Female                  82
3   Female                 126
4     Male                 123
5   Female                 122
6    Other                 104
7    Other                  76
8    Other                  87
9     Male                 118
10   Other                  91
11    Male                 111
12  Female                 104
13    Male                 115
14   Other                  90
15   Other                 126
16    Male                 115
17  Female                 128
18    Male                  77
19   Other                 127


In [16]:
# import df to great expectations
df_ge = ge.from_pandas(df)

# create expectations (failure case)
gender_exp = df_ge.expect_column_values_to_be_in_set(
    "Gender",
    ["Male", "Female"]
)
pprint(gender_exp)
print(df_ge.Gender.value_counts())

# create expectations (success case)
gender_exp = df_ge.expect_column_values_to_be_in_set(
    "Gender",
    ["Male", "Female", "Other"]
)
pprint(gender_exp)

{
  "meta": {},
  "success": false,
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 7,
    "unexpected_percent": 35.0,
    "unexpected_percent_total": 35.0,
    "unexpected_percent_nonmissing": 35.0,
    "partial_unexpected_list": [
      "Other",
      "Other",
      "Other",
      "Other",
      "Other",
      "Other",
      "Other"
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
Gender
Male      7
Other     7
Female    6
Name: count, dtype: int64
{
  "meta": {},
  "success": true,
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "e

Data Validation Workflow in GE
(1. install GE
 2. Create data context
 3. Connect to data
 4. Create a validator
 5. Create expectations
 6. Run a checkpoint
 7. View validation results)

In [23]:
# create context
context = ge.get_context()
pprint(context)

# create validator (direct input)
data_path = "/Users/pepijnschouten/Desktop/Python_Scripts/" \
    "Python_Scripts_Books/Data_Engineering/Data_Engineering_for" \
        "_ML_Pipelines/Own_Files/6_Data_Validation_Great_Expectations" \
            "/data/MockarooHealth.csv"
validator = context.sources.pandas_default.read_csv(data_path)

# create data source
data_source_name = "test2"
ge.datasource = context.sources.add_pandas(data_source_name)

# create data asset
asset_name = "asset1"
asset = ge.datasource.add_csv_asset(
    name=asset_name,
    filepath_or_buffer=data_path)

# build batch request
batch_request = asset.build_batch_request()

# create expectation suite
context.add_or_update_expectation_suite(
    "my_expectation_suite")

# create a validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="my_expectation_suite")
print(validator.head())


{
  "anonymous_usage_statistics": {
    "explicit_id": true,
    "enabled": true,
    "explicit_url": false,
    "data_context_id": "09f188f3-5ed0-43d3-8ae2-2c5b5beebae6",
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics"
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_version": 3,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "/var/folders/pv/ttwy3_md59d5w2n1jcpcn2m80000gp/T/tmp7s63x03i"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {},
  "include_rendered_content": {
    "expectation_suite": false,
    "expectation_validation_resu

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

               drug_company  \
0     AMI Cosmetic Co.,Ltd.   
1          Apotheca Company   
2  Blenheim Pharmacal, Inc.   
3  Aurobindo Pharma Limited   
4          Albert Max, Inc.   

                                           drug_name    drug_code  drug_price  \
0                                           Glycerin  356082843-0       21.77   
1  Berberis vulgaris, Hydrangea arborescens, Petr...  641668608-2      333.30   
2                                         Topiramate  225672839-0      630.50   
3                                         Carvedilol  295230280-4      129.39   
4                MENTHOL, METHYL SALYCILATE, BORNEOL  688657119-2      761.70   

  drug_in_stock drug_release_date  
0          True         1/11/2022  
1         False         4/27/2022  
2         False         4/16/2022  
3         False          5/3/2022  
4          True         1/11/2022  


In [26]:
# run validator and run an expectation
expectation_not_null = validator.expect_column_values_to_not_be_null(
    column="drug_name")
pprint(expectation_not_null)

expectation_not_null = validator.expect_column_values_to_not_be_null(
    column="drug_code")
pprint(expectation_not_null)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "meta": {},
  "success": false,
  "result": {
    "element_count": 1000,
    "unexpected_count": 64,
    "unexpected_percent": 6.4,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "meta": {},
  "success": false,
  "result": {
    "element_count": 1000,
    "unexpected_count": 41,
    "unexpected_percent": 4.1000000000000005,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [29]:
# save expectation suit saved to json
save_dir = "/Users/pepijnschouten/Desktop/Python_Scripts/Python_Scripts_Books" \
    "/Data_Engineering/Data_Engineering_for_ML_Pipelines/Own_Files" \
        "/6_Data_Validation_Great_Expectations/saves"
validator.save_expectation_suite(os.path.join(save_dir, "my_expectation_suite.json"),
                                 discard_failed_expectations=False)

Checkpoints (link betweeen expectation suits and data)

In [30]:
# create checkpoint
checkpoint = context.add_or_update_checkpoint(
    name="my_checkpoint",
    validations=
    [
        {
            "batch_request": batch_request,
            "expectation_suite_name": "my_expectation_suite"
        },
    ],
)
checkpoint_result = checkpoint.run()
print(checkpoint_result)

# add to checkpoint
"""
context.add_or_update_checkpoint(
    name="my_test_checkpoint",
    validations=more_validations
)
"""


Calculating Metrics: 0it [00:00, ?it/s]

{
  "run_id": {
    "run_name": null,
    "run_time": "2024-10-29T16:37:26.598715+01:00"
  },
  "run_results": {
    "ValidationResultIdentifier::my_expectation_suite/__none__/20241029T153726.598715Z/test2-asset1": {
      "validation_result": {
        "meta": {
          "great_expectations_version": "0.18.19",
          "expectation_suite_name": "my_expectation_suite",
          "run_id": {
            "run_name": null,
            "run_time": "2024-10-29T16:37:26.598715+01:00"
          },
          "batch_spec": {
            "reader_method": "read_csv",
            "reader_options": {
              "filepath_or_buffer": "/Users/pepijnschouten/Desktop/Python_Scripts/Python_Scripts_Books/Data_Engineering/Data_Engineering_for_ML_Pipelines/Own_Files/6_Data_Validation_Great_Expectations/data/MockarooHealth.csv"
            }
          },
          "batch_markers": {
            "ge_load_time": "20241029T153726.611213Z",
            "pandas_data_fingerprint": "6b2229ed81123eb6e75b51a3b

Data Documentation

In [31]:
# creates website
context.build_data_docs()

{'local_site': 'file:///var/folders/pv/ttwy3_md59d5w2n1jcpcn2m80000gp/T/tmp7s63x03i/index.html'}