### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

data = {
'age': [25, 30, 35, 40, 45],
'income': [50000, 60000, 75000, None, 100000]
}

In [5]:
# Write your code from here

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [6]:
# Write your code from here

### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [7]:
pip install great_expectations

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import great_expectations as gx

# Load a sample DataFrame
data = {'age': [25, 30, 35, 40, 45], 'income': [50000, 60000, 75000, None, 100000]}
df = pd.DataFrame(data)

# Create a Great Expectations DataContext (if you haven't initialized one)
context = gx.get_context()

# Convert the Pandas DataFrame to a Great Expectations Batch
batch = gx.dataset.PandasDataset(df, batch_kwargs={"datasource": "pandas"})

# --- Define a Custom Expectation ---
# Let's create an expectation that checks if all ages are greater than 18.

@gx.core.expectation.column_map_expectation
class ExpectColumnValuesGreaterThanEighteen(gx.core.expectation.ColumnMapExpectation):
    """Expect each entry in a column to be greater than 18."""

    map_fn = lambda x: x > 18
    success_keys = ("mostly",)

    default_kwarg_values = {
        "mostly": 1.0,
    }

    library_metadata = {
        "maturity": "experimental",
        "package": "custom_expectations",
        "tags": ["core", "column", "map"],
        "contributors": ["You"],
    }

# Register the custom expectation with the DataContext
context.add_expectation(ExpectColumnValuesGreaterThanEighteen)

# --- Create an Expectation Suite ---
expectation_suite_name = "custom_age_expectations"
try:
    expectation_suite = context.get_expectation_suite(expectation_suite_name)
    print(f"Loaded existing Expectation Suite: {expectation_suite_name}")
except gx.exceptions.ExpectationSuiteNotFoundError:
    expectation_suite = context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name, overwrite_existing=True
    )
    print(f"Created Expectation Suite: {expectation_suite_name}")

# Add the custom expectation to the suite
expectation_suite.add_expectation(
    {
        "expectation_type": "expect_column_values_greater_than_eighteen",
        "column": "age",
    }
)

# Add a built-in expectation for income not to have missing values (mostly)
expectation_suite.add_expectation(
    {
        "expectation_type": "expect_column_values_to_not_be_null",
        "column": "income",
        "mostly": 0.8,  # Allow up to 20% missing values
    }
)

# Save the Expectation Suite
context.save_expectation_suite(expectation_suite)

# --- Validate the Data ---
results = batch.validate(expectation_suite=expectation_suite)

# Print the validation results
print("\nValidation Results:")
print(results)

# You can also access specific validation results:
for result in results["results"]:
    if not result["success"]:
        print(f"\nValidation failed for expectation: {result['expectation_config']['expectation_type']} on column '{result['expectation_config']['column']}'")
        print(f"Details: {result['result']}")

AttributeError: module 'great_expectations' has no attribute 'dataset'

In [None]:
# --- Task 2: Implement a Basic Alert System for Data Quality Drops ---

def basic_data_quality_alert(validation_results):
    """
    Checks validation results and prints an alert if any expectation failed.

    Args:
        validation_results (dict): The dictionary returned by batch.validate().
    """
    if not validation_results["success"]:
        print("\n--- DATA QUALITY ALERT! ---")
        for result in validation_results["results"]:
            if not result["success"]:
                expectation_type = result["expectation_config"].get("expectation_type", "Unknown Expectation")
                column = result["expectation_config"].get("column", "N/A")
                print(f"  Failure: Expectation '{expectation_type}' failed on column '{column}'.")
                print(f"    Details: {result['result']}")
        print("--- Please investigate the data quality issues. ---")
    else:
        print("\nData quality checks passed successfully.")

# Run the alert system based on the validation results from Task 1
basic_data_quality_alert(results)

# --- Enhanced Alert System (Example with Threshold) ---
def enhanced_data_quality_alert(validation_results, failure_threshold=1):
    """
    Checks validation results and triggers an alert if the number of failed
    expectations exceeds a threshold.

    Args:
        validation_results (dict): The dictionary returned by batch.validate().
        failure_threshold (int): The maximum number of failed expectations allowed.
                                 Defaults to 1.
    """
    failed_count = sum(1 for result in validation_results["results"] if not result["success"])
    if failed_count > failure_threshold:
        print(f"\n--- DATA QUALITY ALERT! ({failed_count} failures) ---")
        for result in validation_results["results"]:
            if not result["success"]:
                expectation_type = result["expectation_config"].get("expectation_type", "Unknown Expectation")
                column = result["expectation_config"].get("column", "N/A")
                print(f"  Failure: Expectation '{expectation_type}' failed on column '{column}'.")
                print(f"    Details: {result['result']}")
        print("--- Data quality has dropped below the acceptable threshold. ---")
        # You could add more sophisticated alerting here (e.g., send an email)
    else:
        print(f"\nData quality check passed with {failed_count} failures (threshold: {failure_threshold}).")

# Example of using the enhanced alert system
enhanced_data_quality_alert(results, failure_threshold=0) # Trigger if any failure occurs

In [None]:
import time
import random
from datetime import datetime

# --- Task 3: Real-time Data Quality Monitoring with Python and Great Expectations ---

def simulate_real_time_data_stream():
    """Simulates a stream of incoming data."""
    while True:
        new_data = {'age': [random.randint(20, 50)], 'income': [random.randint(40000, 120000) if random.random() > 0.1 else None]}
        new_df = pd.DataFrame(new_data)
        yield new_df
        time.sleep(5)  # Simulate a 5-second interval

def monitor_data_quality(data_stream, context, expectation_suite_name):
    """
    Monitors incoming data, validates it against the expectation suite,
    and triggers alerts.

    Args:
        data_stream (generator): A generator yielding incoming pandas DataFrames.
        context (gx.DataContext): The Great Expectations DataContext.
        expectation_suite_name (str): The name of the Expectation Suite to use.
    """
    try:
        expectation_suite = context.get_expectation_suite(expectation_suite_name)
    except gx.exceptions.ExpectationSuiteNotFoundError:
        print(f"Error: Expectation Suite '{expectation_suite_name}' not found.")
        return

    print("\n--- Real-time Data Quality Monitoring Started ---")
    for i, new_df in enumerate(data_stream):
        batch = gx.dataset.PandasDataset(new_df, batch_kwargs={"datasource": "realtime_stream"})
        validation_results = batch.validate(expectation_suite=expectation_suite)

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"\n[{timestamp}] Validating new data batch {i+1}:")
        print(validation_results["success"])
        basic_data_quality_alert(validation_results) # Use the alert system

        # In a real system, you might log these results to a monitoring dashboard

if __name__ == "__main__":
    # Ensure you have a DataContext initialized and the 'custom_age_expectations' suite saved
    context = gx.get_context()
    expectation_suite_name = "custom_age_expectations"

    # Simulate a real-time data stream
    data_stream = simulate_real_time_data_stream()

    # Start the monitoring process
    monitor_data_quality(data_stream, context, expectation_suite_name)