In [None]:
# Question: Data Quality Automation Tools - Introduction to Great Expectations
# Description: Set up a simple Great Expectations check for missing values in a numeric column.



In [2]:
pip install great_expectations


Defaulting to user installation because normal site-packages is not writeable
Collecting great_expectations
  Downloading great_expectations-1.4.4-py3-none-any.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting jinja2>=3
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jsonschema>=2.5.1
  Downloading jsonschema-4.23.0-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.5/88.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mistune>=0.8.4
  Downloading mistune-3.1.3-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting posthog<4,>3
  Downloading posthog-3.25.0-py2.py3-none-any.w

In [4]:
import os
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import get_context

# Validate that the numeric "value" column has no missing values, using the
# Great Expectations Core 1.x fluent API. The legacy 0.x calls
# (create_expectation_suite, add_datasource with class_name config,
# get_validator, SimpleCheckpoint, run_checkpoint) were removed in 1.0.

# Step 1: Set up the project directory.
# Resolve it to an absolute path instead of calling os.chdir(): changing the
# working directory makes the cell non-idempotent (a second run would create
# my_gx_project/my_gx_project).
project_dir = os.path.abspath("my_gx_project")
os.makedirs(project_dir, exist_ok=True)

# Step 2: Initialize Great Expectations programmatically.
# An ephemeral (in-memory) context starts empty on every run, so re-running
# this cell never collides with previously registered names.
context = ge.get_context(mode="ephemeral")

# Step 3: Create sample CSV data (row 3 deliberately has a missing value).
df = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "value": [10, 15, None, 20]
})
csv_path = os.path.join(project_dir, "sample_data.csv")
df.to_csv(csv_path, index=False)

# Read the file back so the validation exercises the data as stored on disk.
batch_df = pd.read_csv(csv_path)

# Step 4: Register a pandas data source, a dataframe asset and a batch
# definition that treats the whole dataframe as a single batch.
data_source = context.data_sources.add_pandas(name="my_datasource")
data_asset = data_source.add_dataframe_asset(name="sample_data_asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    "sample_data_batch"
)

# Step 5: Create the Expectation Suite and add the expectation.
suite_name = "missing_values_suite"
suite = context.suites.add(ge.ExpectationSuite(name=suite_name))

# Expect no missing values in 'value' column
suite.add_expectation(
    ge.expectations.ExpectColumnValuesToNotBeNull(column="value")
)

# Step 6: Bind the suite to the data with a Validation Definition.
validation_definition = context.validation_definitions.add(
    ge.ValidationDefinition(
        name="missing_value_validation",
        data=batch_definition,
        suite=suite,
    )
)

# Step 7: Run the validation through a Checkpoint.
checkpoint_name = "missing_value_checkpoint"
checkpoint = context.checkpoints.add(
    ge.Checkpoint(
        name=checkpoint_name,
        validation_definitions=[validation_definition],
    )
)

# Dataframe assets receive their data at run time via batch_parameters.
results = checkpoint.run(batch_parameters={"dataframe": batch_df})

# Step 8: Output validation result summary.
# In GX 1.x each run_results value is the suite validation result itself
# (there is no nested "validation_result" key as in the legacy API).
validation_result = results.run_results
for run_key, result in validation_result.items():
    print("Validation Success:", result.success)
    print("Expectations Met:", result.statistics)


ModuleNotFoundError: No module named 'numpy.rec'