In [3]:
# Question: Data Quality Automation Tools - Introduction to Great Expectations
# Description: Set up a simple Great Expectations check for missing values in a numeric column.



In [4]:
pip install great-expectations

Collecting great-expectations
  Downloading great_expectations-1.4.3-py3-none-any.whl.metadata (8.8 kB)
Collecting altair<5.0.0,>=4.2.1 (from great-expectations)
  Downloading altair-4.2.2-py3-none-any.whl.metadata (13 kB)
Collecting cryptography>=3.2 (from great-expectations)
  Downloading cryptography-44.0.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.7 kB)
Collecting jinja2>=3 (from great-expectations)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting jsonschema>=2.5.1 (from great-expectations)
  Downloading jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting marshmallow<4.0.0,>=3.7.1 (from great-expectations)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mistune>=0.8.4 (from great-expectations)
  Downloading mistune-3.1.3-py3-none-any.whl.metadata (1.8 kB)
Collecting posthog<4,>3 (from great-expectations)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting pydantic>=1.10.7 (from great

In [5]:
import pandas as pd
import great_expectations as gx

# Missing-value check for a numeric column, written against the
# Great Expectations Core 1.x API (the version installed above).
# The legacy 0.x entry points (batch_kwargs, context.get_batch(batch_kwargs=...),
# create_expectation_suite, run_validation_operator, gx.dataset.PandasDataset)
# no longer exist in 1.x, so the workflow below uses the fluent
# data source -> asset -> batch definition -> batch chain instead.

# 1. Load your data (replace with your actual data loading)
data = {'col1': [1, 2, None, 4, 5],
        'col2': ['a', 'b', 'c', 'd', 'e']}
df = pd.DataFrame(data)

# 2. Get a Data Context
# An ephemeral (in-memory) context needs no project scaffolding on disk and
# starts empty on every run, so re-running this cell never collides with
# previously registered data sources or suites.
context = gx.get_context(mode="ephemeral")

# 3. Create a Batch of Data from the in-memory DataFrame
data_source = context.data_sources.add_pandas(name="my_pandas_datasource")
data_asset = data_source.add_dataframe_asset(name="my_dataframe")
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    "my_dataframe_whole_batch"
)
# The DataFrame itself is supplied at runtime through batch_parameters.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

# 4./5. Build the Expectation Suite with an Expectation for missing values
expectation_suite_name = "missing_value_check_suite"
column_to_check = "col1"

not_null_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(
    column=column_to_check
)

# To tolerate a share of nulls, state the minimum fraction that must be
# NON-null via `mostly`, e.g. allow up to 20% nulls:
# not_null_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(
#     column=column_to_check, mostly=0.8
# )

# 6. Register (save) the Expectation Suite with the context.
# add_or_update keeps this step idempotent if the context is ever swapped for
# a persistent (file-backed) one where the suite may already exist.
suite = context.suites.add_or_update(
    gx.ExpectationSuite(
        name=expectation_suite_name,
        expectations=[not_null_expectation],
    )
)
print(f"Registered Expectation Suite: {expectation_suite_name}")

# 7. Validate the batch against the suite
validation_result = batch.validate(suite)

# 8. Review the Validation Results
if validation_result.success:
    print(f"\nValidation Successful! No unexpected missing values found in '{column_to_check}'.")
else:
    print(f"\nValidation Failed! Unexpected missing values found in '{column_to_check}':")
    for expectation_result in validation_result.results:
        if not expectation_result.success:
            # `expectation_config.type` holds the snake_case expectation name;
            # `result` carries the counts/percentages of unexpected values.
            print(
                f"  - {expectation_result.expectation_config.type}: "
                f"{expectation_result.result}"
            )

# 9. (Optional) Data Docs
# Validating a batch directly does not persist results, so there is nothing
# for Data Docs to render here. To publish Data Docs, use a file-backed
# context (gx.get_context(mode="file")), wrap the suite in a
# ValidationDefinition, and run it through a Checkpoint with an
# UpdateDataDocsAction.

ModuleNotFoundError: No module named 'numpy.strings'