## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement validation rules using a healthcare dataset to reduce errors in
predictive models by automating data quality checks.

In [1]:
import great_expectations as gx
import pandas as pd
import numpy as np

# Initialize Great Expectations Data Context
context = gx.DataContext()

# --- Finance - Fraud Detection Models: Data Quality ---
print("\n--- Finance - Fraud Detection Models: Data Quality ---")

# 1. Simulate a financial dataset (replace with your actual data loading)
data = {
    'transaction_id': range(1, 101),
    'user_id': np.random.randint(100, 500, 100),
    'transaction_amount': np.random.uniform(10, 1000, 100),
    'transaction_time': pd.to_datetime('2025-05-16') + pd.to_timedelta(np.random.randint(0, 86400, 100), unit='s'),
    'ip_address': [f'192.168.1.{np.random.randint(1, 255)}' for _ in range(100)],
    'device_type': np.random.choice(['mobile', 'web', 'tablet', None], size=100, p=[0.4, 0.5, 0.05, 0.05]),
    'location': np.random.choice(['Bengaluru', 'Mumbai', 'Delhi', None], size=100, p=[0.6, 0.3, 0.08, 0.02]),
    'is_fraud': np.random.choice([0, 1], size=100, p=[0.95, 0.05]),
    'transaction_type': np.random.choice(['deposit', 'withdrawal', 'transfer', None], size=100, p=[0.3, 0.4, 0.2, 0.1]),
}
fraud_df = pd.DataFrame(data)

# Introduce some data quality issues for demonstration
fraud_df.loc[np.random.choice(fraud_df.index, size=5, replace=False), 'ip_address'] = None
fraud_df.loc[np.random.choice(fraud_df.index, size=3, replace=False), 'transaction_amount'] = np.nan
fraud_df.loc[np.random.choice(fraud_df.index, size=2, replace=False), 'user_id'] = -1 # Invalid user ID

# 2. Add Pandas DataFrame Data Source and Data Asset
datasource_name = "fraud_data_source"
datasource = context.sources.add_pandas(name=datasource_name)
data_asset_name = "fraud_transactions"
data_asset = datasource.add_dataframe_asset(name=data_asset_name)
batch_request = data_asset.build_batch_request(dataframe=fraud_df)

# 3. Get a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="fraud_data_quality_suite",
)

print(f"Using Expectation Suite: {validator.expectation_suite.name}")

# 4. Define SLAs for Data Accuracy and Completeness

# Completeness SLA:
completeness_threshold = 0.98  # Expect at least 98% non-null values for key fields
for col in ['user_id', 'transaction_amount', 'transaction_time', 'ip_address', 'device_type', 'location', 'transaction_type']:
    validator.expect_column_values_to_not_be_null(column=col, mostly=completeness_threshold)

# Accuracy SLA:
# Expect transaction amounts to be within a reasonable range
validator.expect_column_values_to_be_between(
    column='transaction_amount',
    min_value=1,
    max_value=10000,
    mostly=0.99,
    meta={"notes": "SLA: 99% of transaction amounts should be within a reasonable range"},
)

# Expect user IDs to be positive (assuming -1 is an invalid value)
validator.expect_column_values_to_be_greater_than(
    column='user_id',
    value=0,
    mostly=1.0,
    meta={"notes": "SLA: All user IDs should be positive"},
)

# Expect 'is_fraud' to only have 0 or 1
validator.expect_column_values_to_be_in_set(
    column='is_fraud',
    value_set=[0, 1],
    mostly=1.0,
    meta={"notes": "SLA: 'is_fraud' should only contain 0 or 1"},
)

# Expect IP addresses to follow a basic pattern (very basic check)
validator.expect_column_values_to_match_regex(
    column='ip_address',
    regex=r'^(\d{1,3}\.){3}\d{1,3}$',
    mostly=0.95, # Allow for some missing values
    meta={"notes": "SLA: 95% of IP addresses should match a basic IPv4 pattern"},
)

# Expect transaction type to be in a predefined set
allowed_transaction_types = ['deposit', 'withdrawal', 'transfer']
validator.expect_column_values_to_be_in_set(
    column='transaction_type',
    value_set=allowed_transaction_types,
    mostly=0.90, # Allow for some missing or other types
    meta={"notes": "SLA: 90% of transaction types should be in the allowed set"},
)

# 5. Save the Expectation Suite
validator.save_expectation_suite()

# 6. Run the validation using a Checkpoint
checkpoint_name = "fraud_data_quality_checkpoint"
checkpoint_result = context.run_checkpoint(
    checkpoint_name=checkpoint_name,
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": "fraud_data_quality_suite",
        }
    ],
)

# 7. Review the validation results
print("\nFraud Detection Data Quality Validation Results:")
for result in checkpoint_result.list_validation_results():
    print(f"Expectation: {result['expectation_config']['expectation_type']}, Success: {result['success']}, Details: {result['result']}")

# 8. Optionally, view the detailed report in Data Docs
print("\nTo view the detailed validation report in Data Docs:")
print(f"- Navigate to your Great Expectations Data Context directory.")
print("- Run the command: `great_expectations docs build`")
print("- Open the generated `index.html` file and find the results for the '{checkpoint_name}' Checkpoint and the 'fraud_data_quality_suite' Expectation Suite.")

AttributeError: module 'great_expectations' has no attribute 'DataContext'