### Implementing Basic Data Validation with Great Expectations
**Description**: Set up a simple data validation using Great Expectations to check the completeness of a dataset.

**Steps**:
1. Installation
2. Initialize Great Expectations
3. Create a Data Context in Python
4. Create an Expectation Suite
5. Load Sample Data and Validate Completeness
6. Run Validations

In [None]:
# write your code from here


In [1]:
pip install great_expectations


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import get_context
from great_expectations.core.batch import BatchRequest
from great_expectations.exceptions import GreatExpectationsError

def load_sample_data():
    """Simulate loading healthcare data with potential issues."""
    return pd.DataFrame({
        "patient_id": [1, 2, 3, 4, 5],
        "age": [25, 38, None, 45, 60],
        "cholesterol": [180, 190, 210, None, 230],
        "diagnosis": ["diabetes", "none", "hypertension", "diabetes", None]
    })

def setup_context():
    """Initialize in-memory Great Expectations context."""
    try:
        context = get_context()
        return context
    except GreatExpectationsError as e:
        print(f"Error initializing context: {e}")
        return None

def add_datasource(context, datasource_name="pandas_datasource"):
    """Configure in-memory Pandas datasource."""
    try:
        context.add_datasource(
            name=datasource_name,
            class_name="Datasource",
            execution_engine={"class_name": "PandasExecutionEngine"},
            data_connectors={
                "default_runtime_data_connector_name": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"],
                }
            },
        )
    except Exception as e:
        print(f"Error adding datasource: {e}")

def create_expectation_suite(context, suite_name="completeness_suite"):
    """Create or update an expectation suite."""
    try:
        context.add_or_update_expectation_suite(expectation_suite_name=suite_name)
        return suite_name
    except Exception as e:
        print(f"Error creating expectation suite: {e}")
        return None

def validate_data(context, suite_name, data):
    """Run completeness validation on provided data."""
    try:
        batch_request = BatchRequest(
            datasource_name="pandas_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="healthcare_data",
            runtime_parameters={"batch_data": data},
            batch_identifiers={"default_identifier_name": "default_id"},
        )

        validator = context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=suite_name,
        )

        # Add expectations
        for col in data.columns:
            validator.expect_column_values_to_not_be_null(col)

        validator.save_expectation_suite(discard_failed_expectations=False)

        # Run validations
        results = context.run_validation_operator(
            "action_list_operator", assets_to_validate=[validator]
        )

        return results
    except Exception as e:
        print(f"Validation error: {e}")
        return None

def main():
    data = load_sample_data()
    context = setup_context()
    if context:
        add_datasource(context)
        suite_name = create_expectation_suite(context)
        if suite_name:
            results = validate_data(context, suite_name, data)
            if results:
                import json
                print(json.dumps(results["run_results"], indent=2, default=str))
            else:
                print("Validation failed.")
        else:
            print("Failed to create expectation suite.")
    else:
        print("Failed to initialize Great Expectations context.")

if __name__ == "__main__":
    main()


Error adding datasource: Datasource is not a FluentDatasource
Error creating expectation suite: 'EphemeralDataContext' object has no attribute 'add_or_update_expectation_suite'
Failed to create expectation suite.


import unittest

class TestExpectations(unittest.TestCase):
    def test_all_columns_present(self):
        df = load_sample_data()
        self.assertIn("patient_id", df.columns)
        self.assertIn("age", df.columns)

    def test_null_values(self):
        df = load_sample_data()
        self.assertTrue(df["age"].isnull().any())
        self.assertTrue(df["diagnosis"].isnull().any())

if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)
