## Defining Data Quality SLAs
### Data Completeness
**Description**: Set an SLA that ensures that at least 95% of the data fields in your dataset are filled (non-null values). Practice by checking a dataset of your choice and calculating its completeness.

In [None]:
import great_expectations as gx
import pandas as pd

# Data Completeness SLA check.
#
# Loads a CSV into pandas, registers it with Great Expectations (fluent pandas
# API), expects every column to be at least 95% non-null, runs the suite through
# a Checkpoint, and reports per-column and overall completeness.

# 1. Load your dataset using Pandas
csv_file_path = "data_completeness_check.csv"  # Replace with the path to your dataset
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {csv_file_path}")
    # `exit()` did not stop this cell in the recorded notebook run (execution
    # carried on to the next statement), so abort explicitly instead.
    raise SystemExit(1) from None

# 2. Create a Great Expectations Data Context (if you don't have one)
# `gx.DataContext` is not available in current releases; `get_context()` is the
# supported entry point.
context = gx.get_context()

# 3. Add a Pandas DataFrame Data Source and Data Asset
# `add_or_update_*` keeps the cell re-runnable against a persistent context.
datasource_name = "completeness_data_source"
datasource = context.sources.add_or_update_pandas(name=datasource_name)

data_asset_name = "completeness_data"
data_asset = datasource.add_dataframe_asset(name=data_asset_name)

batch_request = data_asset.build_batch_request(dataframe=df)

# 4. Get a Validator
# The suite must exist before a Validator can be bound to it by name.
suite_name = "data_completeness_suite"  # You can name your suite
context.add_or_update_expectation_suite(expectation_suite_name=suite_name)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

print(f"Using Expectation Suite: {validator.expectation_suite_name}")

# 5. Define the expectation for data completeness (at least 95% non-null values)
completeness_threshold = 0.95

# We'll use expect_column_values_to_not_be_null for each column
for column in df.columns:
    validator.expect_column_values_to_not_be_null(
        column=column,
        mostly=completeness_threshold,
    )

# 6. Save the Expectation Suite
# Keep expectations that failed on this batch: a failing column is exactly what
# the SLA check has to report, so it must stay in the suite.
validator.save_expectation_suite(discard_failed_expectations=False)

# 7. Run the validation using a Checkpoint
# The Checkpoint has to be registered with the context before it can be run.
checkpoint_name = "data_completeness_checkpoint"
checkpoint = context.add_or_update_checkpoint(
    name=checkpoint_name,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": suite_name,
        }
    ],
)
checkpoint_result = checkpoint.run()

# 8. Review the validation results to see if the SLA is met
print("\nValidation Results against SLA (95% completeness per field):")
validation_result = checkpoint_result.list_validation_results()[0]
sla_met = True
for expectation_result in validation_result["results"]:
    expectation_config = expectation_result["expectation_config"]
    if expectation_config["expectation_type"] != "expect_column_values_to_not_be_null":
        continue
    column_name = expectation_config["kwargs"]["column"]
    success = expectation_result["success"]
    mostly = expectation_config["kwargs"].get("mostly", 1.0)
    print(f"Column '{column_name}': SLA Met = {success} (Expected >= {mostly*100:.2f}%)")
    if not success:
        sla_met = False

if sla_met:
    print("\nData Completeness SLA (95% non-null per field) was MET for all fields.")
else:
    print("\nData Completeness SLA (95% non-null per field) was NOT MET for one or more fields.")

# 9. Calculate and print the overall data completeness
total_cells = df.size
non_null_cells = df.count().sum()  # DataFrame.count() counts non-null cells per column
overall_completeness = (non_null_cells / total_cells) if total_cells > 0 else 0.0
print(f"\nOverall Data Completeness: {overall_completeness * 100:.2f}%")

# 10. Optionally, view the detailed report in Data Docs
print("\nTo view the detailed validation report in Data Docs:")
print("- Navigate to your Great Expectations Data Context directory.")
print("- Run the command: `great_expectations docs build`")
# f-string so the actual checkpoint and suite names are shown, not the placeholder text.
print(f"- Open the generated `index.html` file and find the results for the '{checkpoint_name}' Checkpoint and the '{suite_name}' Expectation Suite.")

Error: File not found at data_completeness_check.csv


AttributeError: module 'great_expectations' has no attribute 'DataContext'

: 

### Data Timeliness:
**Description**: Establish an SLA that specifies that data should be integrated and processed within 24 hours of acquisition. Monitor the data pipeline for timeliness.

In [None]:
import great_expectations as gx
import pandas as pd
from datetime import datetime, timedelta

# Data Timeliness SLA check.
#
# Builds a small table of pipeline runs, derives each run's processing duration
# in hours, and uses Great Expectations to assert that every run finished within
# the 24-hour SLA.

# 1. Simulate fetching pipeline metadata (replace with your actual metadata retrieval)
pipeline_metadata = [
    {"data_id": "batch_1", "acquisition_time": "2025-05-15 10:00:00", "processing_end_time": "2025-05-15 18:00:00"},
    {"data_id": "batch_2", "acquisition_time": "2025-05-15 12:30:00", "processing_end_time": "2025-05-16 11:00:00"},
    {"data_id": "batch_3", "acquisition_time": "2025-05-16 09:00:00", "processing_end_time": "2025-05-17 10:00:00"},
]

df = pd.DataFrame(pipeline_metadata)

# Convert timestamp strings to datetime objects
df['acquisition_time'] = pd.to_datetime(df['acquisition_time'])
df['processing_end_time'] = pd.to_datetime(df['processing_end_time'])

# Calculate the processing duration
df['processing_duration'] = df['processing_end_time'] - df['acquisition_time']

# Define the SLA threshold (24 hours)
sla_threshold = timedelta(hours=24)

# Validate against a plain numeric column (hours) rather than a timedelta column,
# so the comparison and the reported unexpected values are simple floats.
duration_column = "processing_duration_hours"
df[duration_column] = df['processing_duration'].dt.total_seconds() / 3600
sla_threshold_hours = sla_threshold.total_seconds() / 3600

# 2. Create a Great Expectations Data Context
# `gx.DataContext` is not available in current releases; `get_context()` is the
# supported entry point.
context = gx.get_context()

# 3. Add a Pandas DataFrame Data Source and Data Asset
# `add_or_update_*` keeps the cell re-runnable against a persistent context.
datasource_name = "pipeline_metadata_source"
datasource = context.sources.add_or_update_pandas(name=datasource_name)

data_asset_name = "pipeline_runs"
data_asset = datasource.add_dataframe_asset(name=data_asset_name)

batch_request = data_asset.build_batch_request(dataframe=df)

# 4. Get a Validator
# The suite must exist before a Validator can be bound to it by name.
suite_name = "data_timeliness_suite"
context.add_or_update_expectation_suite(expectation_suite_name=suite_name)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

print(f"Using Expectation Suite: {validator.expectation_suite_name}")

# 5. Define the expectation to check if processing duration is within the SLA
# There is no "less than or equal to" column expectation; an upper-bounded
# `expect_column_values_to_be_between` (inclusive by default) expresses `<= SLA`.
timeliness_expectation = "expect_column_values_to_be_between"
validator.expect_column_values_to_be_between(
    column=duration_column,
    max_value=sla_threshold_hours,
    mostly=1.0,  # Expect all batches to meet the SLA
)

# 6. Save the Expectation Suite
# Keep the expectation even if it failed on this batch; otherwise the Checkpoint
# below would run an empty suite.
validator.save_expectation_suite(discard_failed_expectations=False)

# 7. Run the validation using a Checkpoint
# The Checkpoint has to be registered with the context before it can be run.
checkpoint_name = "data_timeliness_checkpoint"
checkpoint = context.add_or_update_checkpoint(
    name=checkpoint_name,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": suite_name,
        }
    ],
)
checkpoint_result = checkpoint.run()

# 8. Review the validation results
print("\nValidation Results for Data Timeliness SLA (<= 24 hours):")
validation_result = checkpoint_result.list_validation_results()[0]
timeliness_sla_met = True
timeliness_checked = False
for expectation_result in validation_result["results"]:
    expectation_config = expectation_result["expectation_config"]
    if (
        expectation_config["expectation_type"] != timeliness_expectation
        or expectation_config["kwargs"]["column"] != duration_column
    ):
        continue
    timeliness_checked = True
    success = expectation_result["success"]
    print(f"Timeliness SLA Met for all batches: {success}")
    if not success:
        timeliness_sla_met = False
        if "partial_unexpected_list" in expectation_result["result"]:
            print(f"  - Batches exceeding SLA: {expectation_result['result']['partial_unexpected_list']}")
        # The unexpected list only holds durations; also name the offending batches.
        late_batches = df.loc[df[duration_column] > sla_threshold_hours, "data_id"].tolist()
        print(f"  - Batch IDs exceeding SLA: {late_batches}")

# Do not report success if the timeliness expectation never produced a result.
if not timeliness_checked:
    timeliness_sla_met = False
    print("No timeliness expectation result was found in the validation output.")

if timeliness_sla_met:
    print("\nData Timeliness SLA (processing within 24 hours) was MET for all monitored batches.")
else:
    print("\nData Timeliness SLA (processing within 24 hours) was NOT MET for one or more monitored batches.")

# 9. Optionally, view the detailed report in Data Docs
print("\nTo view the detailed validation report in Data Docs:")
print("- Navigate to your Great Expectations Data Context directory.")
print("- Run the command: `great_expectations docs build`")
# f-string so the actual checkpoint and suite names are shown, not the placeholder text.
print(f"- Open the generated `index.html` file and find the results for the '{checkpoint_name}' Checkpoint and the '{suite_name}' Expectation Suite.")

AttributeError: module 'great_expectations' has no attribute 'DataContext'

### Data Consistency:
**Description**: Define an SLA for maintaining consistency across various related datasets. Implement a check to ensure that 99% of data entries are consistent.

In [None]:
import great_expectations as gx
import pandas as pd

# Data Consistency SLA check.
#
# Compares two related order datasets keyed by CustomerID:
#   * status consistency - for CustomerIDs present in both, Status_A must equal
#     Status_B for at least 99% of rows;
#   * key overlap        - at least 99% of CustomerIDs of each dataset must also
#     exist in the other one.
# All checks are expressed as Great Expectations expectations and run together
# through one Checkpoint.

# 1. Simulate two related datasets (replace with your actual data loading)
data1 = [
    {"CustomerID": 1, "ProductName": "Laptop", "OrderDate": "2025-05-10", "Status_A": "Shipped"},
    {"CustomerID": 2, "ProductName": "Mouse", "OrderDate": "2025-05-11", "Status_A": "Delivered"},
    {"CustomerID": 3, "ProductName": "Keyboard", "OrderDate": "2025-05-12", "Status_A": "Shipped"},
    {"CustomerID": 4, "ProductName": "Monitor", "OrderDate": "2025-05-13", "Status_A": "Pending"},
    {"CustomerID": 5, "ProductName": "Webcam", "OrderDate": "2025-05-14", "Status_A": "Shipped"},
]
df1 = pd.DataFrame(data1)

data2 = [
    {"CustomerID": 1, "Product": "Laptop", "DeliveryDate": "2025-05-12", "Status_B": "Shipped"},
    {"CustomerID": 2, "Product": "Mouse", "DeliveryDate": "2025-05-13", "Status_B": "Delivered"},
    {"CustomerID": 3, "Product": "Keyboard", "DeliveryDate": "2025-05-12", "Status_B": "Processing"},
    {"CustomerID": 4, "Product": "Monitor", "DeliveryDate": "2025-05-14", "Status_B": "Pending"},
    {"CustomerID": 6, "Product": "Tablet", "DeliveryDate": "2025-05-15", "Status_B": "Shipped"},
]
df2 = pd.DataFrame(data2)

consistency_threshold = 0.99

# 2. Create a Great Expectations Data Context
# `gx.DataContext` is not available in current releases; `get_context()` is the
# supported entry point.
context = gx.get_context()

# 3. Add Pandas DataFrame Data Sources and Data Assets for both DataFrames
# `add_or_update_*` keeps the cell re-runnable against a persistent context.
datasource_name = "consistency_data_source"
datasource = context.sources.add_or_update_pandas(name=datasource_name)

data_asset_name_1 = "dataset_1"
data_asset_1 = datasource.add_dataframe_asset(name=data_asset_name_1)
batch_request_1 = data_asset_1.build_batch_request(dataframe=df1)

data_asset_name_2 = "dataset_2"
data_asset_2 = datasource.add_dataframe_asset(name=data_asset_name_2)
batch_request_2 = data_asset_2.build_batch_request(dataframe=df2)

# 4. Get Validators for both Data Assets
# Each suite must exist before a Validator can be bound to it by name.
suite_name_1 = "consistency_suite_dataset_1"  # You can have separate suites or one
suite_name_2 = "consistency_suite_dataset_2"
context.add_or_update_expectation_suite(expectation_suite_name=suite_name_1)
context.add_or_update_expectation_suite(expectation_suite_name=suite_name_2)

validator_1 = context.get_validator(
    batch_request=batch_request_1,
    expectation_suite_name=suite_name_1,
)

validator_2 = context.get_validator(
    batch_request=batch_request_2,
    expectation_suite_name=suite_name_2,
)

print(f"Using Expectation Suite 1: {validator_1.expectation_suite_name}")
print(f"Using Expectation Suite 2: {validator_2.expectation_suite_name}")

# Every (batch_request, suite) pair collected here is run by the Checkpoint in step 7.
validations = [
    {"batch_request": batch_request_1, "expectation_suite_name": suite_name_1},
    {"batch_request": batch_request_2, "expectation_suite_name": suite_name_2},
]

# 5. Implement checks for consistency (example: status consistency where CustomerID exists in both)
merged_df = pd.merge(df1, df2, on="CustomerID", how="inner", suffixes=("_A", "_B"))

if not merged_df.empty:
    consistent_status_count = (merged_df["Status_A"] == merged_df["Status_B"]).sum()
    total_common_records = len(merged_df)
    consistency_rate = consistent_status_count / total_common_records

    print("\nConsistency Check for 'Status' across common CustomerIDs:")
    print(f"Consistent Status Count: {consistent_status_count}")
    print(f"Total Common Records: {total_common_records}")
    print(f"Consistency Rate: {consistency_rate * 100:.2f}%")

    # Express the SLA as a real expectation on the merged rows: the two status
    # columns must match for at least `consistency_threshold` of the records.
    merged_asset_name = "dataset_merged"
    merged_asset = datasource.add_dataframe_asset(name=merged_asset_name)
    batch_request_merged = merged_asset.build_batch_request(dataframe=merged_df)

    suite_name_status = "consistency_suite_status"
    context.add_or_update_expectation_suite(expectation_suite_name=suite_name_status)
    validator_status = context.get_validator(
        batch_request=batch_request_merged,
        expectation_suite_name=suite_name_status,
    )
    validator_status.expect_column_pair_values_to_be_equal(
        column_A="Status_A",
        column_B="Status_B",
        mostly=consistency_threshold,
        meta={"notes": f"SLA for status consistency: >= {consistency_threshold * 100:.2f}%"},
    )
    # Keep the expectation even if it failed on this batch, so the Checkpoint reports it.
    validator_status.save_expectation_suite(discard_failed_expectations=False)
    validations.append(
        {"batch_request": batch_request_merged, "expectation_suite_name": suite_name_status}
    )

else:
    print("\nNo common CustomerIDs found between the datasets to check status consistency.")

# 6. You might also want to check for the presence of keys across datasets
# Example: Ensure all CustomerIDs in df1 also exist in df2 (or vice-versa, or a significant percentage)
customer_ids_1 = set(df1["CustomerID"])
customer_ids_2 = set(df2["CustomerID"])

present_in_1_not_in_2 = customer_ids_1 - customer_ids_2
present_in_2_not_in_1 = customer_ids_2 - customer_ids_1

print(f"\nCustomer IDs present only in Dataset 1: {present_in_1_not_in_2}")
print(f"Customer IDs present only in Dataset 2: {present_in_2_not_in_1}")

# Key-overlap expectations, one per direction.
validator_1.expect_column_values_to_be_in_set(
    column="CustomerID",
    value_set=list(customer_ids_2),
    mostly=consistency_threshold,  # Expect at least 99% of CustomerIDs from df1 to be in df2
    meta={"notes": "SLA: At least 99% of CustomerIDs from Dataset 1 should be in Dataset 2"},
)
validator_1.save_expectation_suite(discard_failed_expectations=False)

validator_2.expect_column_values_to_be_in_set(
    column="CustomerID",
    value_set=list(customer_ids_1),
    mostly=consistency_threshold,  # Expect at least 99% of CustomerIDs from df2 to be in df1
    meta={"notes": "SLA: At least 99% of CustomerIDs from Dataset 2 should be in Dataset 1"},
)
validator_2.save_expectation_suite(discard_failed_expectations=False)

# 7. Run validation for every suite that contains consistency checks
# The Checkpoint has to be registered with the context before it can be run.
checkpoint_name = "data_consistency_checkpoint"
checkpoint = context.add_or_update_checkpoint(
    name=checkpoint_name,
    validations=validations,
)
checkpoint_result = checkpoint.run()

# 8. Review the validation results
print("\nValidation Results for Data Consistency:")
for validation_result in checkpoint_result.list_validation_results():
    for expectation_result in validation_result["results"]:
        print(f"Expectation: {expectation_result['expectation_config']['expectation_type']}, Success: {expectation_result['success']}, Details: {expectation_result['result']}")

# 9. Optionally, view the detailed report in Data Docs
validated_suites = ", ".join(f"'{entry['expectation_suite_name']}'" for entry in validations)
print("\nTo view the detailed validation report in Data Docs:")
print("- Navigate to your Great Expectations Data Context directory.")
print("- Run the command: `great_expectations docs build`")
# f-string so the actual checkpoint and suite names are shown, not the placeholder text.
print(f"- Open the generated `index.html` file and find the results for the '{checkpoint_name}' Checkpoint and the {validated_suites} Expectation Suite(s).")