### Healthcare – Patient Data Accuracy

**Task 1**: Patient Record Accuracy Assessment

**Objective**: Achieve high accuracy in patient records.

**Steps**:
1. Examine a sample patient dataset for common inaccuracies.
2. Identify at least three common issues, such as medication errors or misdiagnoses.
3. Propose validation measures to ensure data accuracy at the point of entry.

In [None]:
# Write your code from here

**Task 2**: Implement Healthcare Data Quality Checks

**Objective**: Maintain accurate health records within a healthcare system.

**Steps**:
1. Develop a validation workflow for patient data.
2. Use appropriate software to automate checks for common errors.

In [None]:
# Write your code from here


In [None]:
# This script demonstrates how to automate data quality checks using Great Expectations.
# It covers setting up a data context, defining expectations, validating data,
# and generating data quality reports.

# --- Prerequisites ---
# Before running this code, you need to install Great Expectations.
# Open your terminal or command prompt and run:
# pip install great_expectations pandas

import os
import shutil
from datetime import datetime

import great_expectations as ge
import pandas as pd

try:
    # Legacy entry point. Newer Great Expectations releases no longer export
    # it, and an unguarded import aborts the whole script with ImportError.
    from great_expectations.data_context import DataContext
except ImportError:
    DataContext = None

# Root directory of the Great Expectations project. The GE scaffolding
# (configuration, stores, Data Docs) is created *inside* this directory.
ge_project_dir = "my_ge_project"

# Start from a clean slate so repeated runs do not pick up suites,
# checkpoints or validation results left behind by an earlier execution.
if os.path.exists(ge_project_dir):
    print(f"Removing existing Great Expectations project directory: {ge_project_dir}")
    shutil.rmtree(ge_project_dir)
os.makedirs(ge_project_dir, exist_ok=True)

# Initialize a file-backed data context rooted at the project directory.
# `get_context` is used because the legacy `DataContext` class is not
# exported by current Great Expectations releases (importing it raises
# ImportError), and its `create()` never accepted a `project_dir` keyword.
print(f"Initializing Great Expectations data context in '{ge_project_dir}'...")
context = ge.get_context(project_root_dir=ge_project_dir)
print("Great Expectations data context initialized.")

# --- Previous Tasks (Customer Data) ---

print("\n--- Previous Tasks: Customer Data Validation ---")

# Sample customer dataset, written one record per line so each deliberately
# dirty record is easy to spot. The records are transposed into the
# column -> values mapping that pandas consumes.
print("\nCreating a sample customer dataset...")
_CUSTOMER_COLUMNS = (
    'customer_id', 'name', 'email', 'age', 'city',
    'registration_date', 'order_count', 'is_active',
)
_customer_rows = [
    (1, 'Alice', 'alice@example.com', 25, 'New York', '2023-01-15', 5, True),
    (2, 'Bob', 'bob@example.com', 30, 'Los Angeles', '2022-03-20', 12, True),
    (3, 'Charlie', 'charlie@example.com', 35, 'Chicago', '2023-07-01', 8, False),
    (4, 'David', 'david@example.com', 40, 'Houston', '2021-11-10', 20, True),
    (5, 'Eve', 'eve@example.com', 28, 'Phoenix', '2023-02-28', 3, True),
    (6, 'Frank', 'frank@example.com', 32, 'Philadelphia', '2022-09-05', 15, False),
    (7, 'Grace', 'grace@example.com', 29, 'San Antonio', '2023-04-12', 7, True),
    (8, 'Heidi', 'heidi@example.com', 45, 'San Diego', '2021-06-30', 25, True),
    (9, 'Ivan', 'ivan@example.com', 22, 'Dallas', '2023-05-18', 2, True),
    (10, 'Judy', 'judy@example.com', 38, 'San Jose', '2022-01-01', 10, False),
    # Exact duplicate of the first record (duplicate customer_id 1).
    (1, 'Alice', 'alice@example.com', 25, 'New York', '2023-01-15', 5, True),
    # Missing age for Kyle.
    (11, 'Kyle', 'kyle@example.com', None, 'Austin', '2023-10-25', 6, True),
]
data_customer = {
    column: list(values)
    for column, values in zip(_CUSTOMER_COLUMNS, zip(*_customer_rows))
}
df_customer = pd.DataFrame(data_customer)
print("Customer Data (first 5 rows):")
print(df_customer.head())

# Register a datasource for the customer data on the context, describe the
# in-memory frame with a batch request, and ask the context for a validator
# bound to the "customer_data_suite" expectation suite.
# NOTE(review): a `PandasDatasource` configured with a datasource-level
# `batch_spec_passthrough`, and a DataFrame handed over through
# `batch_filter_parameters`, do not look like the usual way to pass an
# in-memory DataFrame to Great Expectations — confirm both calls against the
# installed GE version.
datasource_name_customer = "customer_data_source"
data_asset_name_customer = "customer_records"

context.add_datasource(
    name=datasource_name_customer,
    class_name="PandasDatasource",
    batch_spec_passthrough={"reader_method": "dataframe"},
)

# The request carries the DataFrame object itself, not a path or a query.
batch_request_customer = ge.core.batch_request.BatchRequest(
    datasource_name=datasource_name_customer,
    data_asset_name=data_asset_name_customer,
    data_connector_name="default_runtime_data_connector",
    data_connector_query={"batch_filter_parameters": {"batch_data": df_customer}},
)
# Expectations declared on this validator are collected under the named suite.
validator_customer = context.get_validator(
    batch_request=batch_request_customer,
    expectation_suite_name="customer_data_suite"
)

# Declare the customer expectations. Each call both evaluates the expectation
# against the current batch and records it in the suite, so the order below
# is the order stored in "customer_data_suite".
print("\nCreating basic expectations for customer dataset...")
validator_customer.expect_column_to_exist("customer_id")
validator_customer.expect_column_values_to_be_of_type("customer_id", "int")
validator_customer.expect_column_values_to_be_unique("customer_id") # Added unique for customer_id
validator_customer.expect_column_to_exist("name")
validator_customer.expect_column_values_to_be_of_type("name", "str")
validator_customer.expect_column_to_exist("age")
# NOTE(review): the sample data has one missing age, which makes pandas store
# the column as float64, so this type check is expected to be reported.
validator_customer.expect_column_values_to_be_of_type("age", "int")
# Missing values are skipped by column-map expectations by default, so no
# extra flag is needed here (`allow_missing` is not a parameter of this
# expectation and has been dropped).
validator_customer.expect_column_values_to_be_between("age", min_value=18, max_value=99)
validator_customer.expect_column_to_exist("email")
validator_customer.expect_column_values_to_match_regex("email", r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")
validator_customer.expect_column_to_exist("registration_date")
validator_customer.expect_column_values_to_match_strftime_format("registration_date", "%Y-%m-%d")
validator_customer.expect_column_to_exist("order_count")
validator_customer.expect_column_values_to_be_of_type("order_count", "int")
validator_customer.expect_column_values_to_be_between("order_count", min_value=0, max_value=None)
validator_customer.expect_column_to_exist("is_active")
validator_customer.expect_column_values_to_be_of_type("is_active", "bool")

# Persist the suite. Failed expectations are kept on purpose: the sample data
# is deliberately dirty and the suite describes the *desired* state.
validator_customer.save_expectation_suite(discard_failed_expectations=False)
print("Customer expectation suite 'customer_data_suite' saved.")

# Register a checkpoint that ties the customer validator to three
# post-validation actions, then run it by name and keep the result for the
# summary printed afterwards.
# NOTE(review): the action entries use an "action_class" key — confirm this
# is the shape the installed Great Expectations version expects for
# `action_list` items.
checkpoint_name_customer = "customer_data_checkpoint"
context.add_checkpoint(
    name=checkpoint_name_customer,
    validator=validator_customer,
    action_list=[
        {"name": "store_validation_result", "action_class": "StoreValidationResultAction"},
        {"name": "store_evaluation_parameter_metrics", "action_class": "StoreEvaluationParametersAction"},
        {"name": "update_data_docs", "action_class": "UpdateDataDocsAction"},
    ],
)
validation_result_customer = context.run_checkpoint(checkpoint_name=checkpoint_name_customer)

# Summarize the customer validation run: one line on overall success, then
# one entry per failed expectation.
# NOTE(review): this reads `.results` straight off the object returned by
# `run_checkpoint` — confirm that object exposes per-expectation results
# directly in the installed Great Expectations version.
print("\nCustomer Data Validation Results Summary:")
if validation_result_customer.success:
    print("Customer data validation successful! All expectations passed.")
else:
    print("Customer data validation failed. Some expectations did not pass.")
    for result in validation_result_customer.results:
        if result.success:
            continue
        failed_config = result.expectation_config
        print(f"  Failed Expectation: {failed_config.expectation_type}")
        # The target column lives in the expectation's kwargs and is absent
        # for table-level expectations, so it is looked up defensively and
        # printed once.
        failed_column = failed_config.kwargs.get("column")
        if failed_column:
            print(f"    for column '{failed_column}'")
        print(f"    Details: {result.result}")

# --- Previous Task: Finance – Ensuring Accurate Transactions ---

print("\n--- Previous Task: Finance – Ensuring Accurate Transactions ---")

# Sample financial transactions, one record per line, transposed into the
# column -> values mapping that pandas consumes.
print("\nCreating a sample financial transaction dataset...")
_TRANSACTION_COLUMNS = (
    'transaction_id', 'transaction_date', 'account_id', 'transaction_type',
    'amount', 'currency', 'description',
)
_transaction_rows = [
    ('T001', '2023-01-01', 'ACC001', 'DEBIT', 100.50, 'USD', 'Groceries'),
    ('T002', '2023-01-05', 'ACC002', 'CREDIT', 250.00, 'USD', 'Salary'),
    ('T003', '2023-01-10', 'ACC001', 'DEBIT', 50.25, 'EUR', 'Rent'),
    ('T004', '2023-01-15', 'ACC003', 'CREDIT', 120.00, 'USD', 'Utilities'),
    # Negative amount.
    ('T005', '2023-01-20', 'ACC002', 'DEBIT', -30.00, 'USD', 'Refund'),
    ('T006', '2023-01-25', 'ACC004', 'TRANSFER', 75.00, 'GBP', 'Online Purchase'),
    # Missing description.
    ('T007', '2023-01-30', 'ACC001', 'DEBIT', 200.00, 'USD', None),
    # Duplicate of T001.
    ('T001', '2023-01-01', 'ACC001', 'DEBIT', 100.50, 'USD', 'Groceries'),
]
data_transactions = {
    column: list(values)
    for column, values in zip(_TRANSACTION_COLUMNS, zip(*_transaction_rows))
}
df_transactions = pd.DataFrame(data_transactions)
print("Financial Transaction Data (first 5 rows):")
print(df_transactions.head())

# Register a datasource for the transaction data on the context, describe the
# in-memory frame with a batch request, and ask the context for a validator
# bound to the "financial_transactions_suite" expectation suite.
# NOTE(review): a `PandasDatasource` configured with a datasource-level
# `batch_spec_passthrough`, and a DataFrame handed over through
# `batch_filter_parameters`, do not look like the usual way to pass an
# in-memory DataFrame to Great Expectations — confirm both calls against the
# installed GE version.
datasource_name_transactions = "financial_data_source"
data_asset_name_transactions = "transactions"

context.add_datasource(
    name=datasource_name_transactions,
    class_name="PandasDatasource",
    batch_spec_passthrough={"reader_method": "dataframe"},
)

# The request carries the DataFrame object itself, not a path or a query.
batch_request_transactions = ge.core.batch_request.BatchRequest(
    datasource_name=datasource_name_transactions,
    data_asset_name=data_asset_name_transactions,
    data_connector_name="default_runtime_data_connector",
    data_connector_query={"batch_filter_parameters": {"batch_data": df_transactions}},
)
# Expectations declared on this validator are collected under the named suite.
validator_transactions = context.get_validator(
    batch_request=batch_request_transactions,
    expectation_suite_name="financial_transactions_suite"
)

# Declare the transaction expectations on the validator, column by column.
# The order of the calls is the order in which they are issued against the
# "financial_transactions_suite" validator.
print("\nDeveloping validation checks for financial transactions...")
# Identifier: must exist and must not repeat.
validator_transactions.expect_column_to_exist("transaction_id")
validator_transactions.expect_column_values_to_be_unique("transaction_id")
# Date: ISO-style YYYY-MM-DD strings.
validator_transactions.expect_column_to_exist("transaction_date")
validator_transactions.expect_column_values_to_match_strftime_format("transaction_date", "%Y-%m-%d")
# Account: string matching the fixed "ACC" + three digits pattern.
validator_transactions.expect_column_to_exist("account_id")
validator_transactions.expect_column_values_to_be_of_type("account_id", "str")
validator_transactions.expect_column_values_to_match_regex("account_id", r"^ACC\d{3}$") # e.g., ACC001
# Type: restricted to a closed vocabulary.
validator_transactions.expect_column_to_exist("transaction_type")
validator_transactions.expect_column_values_to_be_in_set("transaction_type", ["DEBIT", "CREDIT", "TRANSFER", "FEE"])
# Amount: float with a lower bound of 0.01 and no upper bound.
validator_transactions.expect_column_to_exist("amount")
validator_transactions.expect_column_values_to_be_of_type("amount", "float")
validator_transactions.expect_column_values_to_be_between("amount", min_value=0.01, max_value=None) # Amount must be positive
# Currency: restricted to a closed vocabulary.
validator_transactions.expect_column_to_exist("currency")
validator_transactions.expect_column_values_to_be_in_set("currency", ["USD", "EUR", "GBP", "JPY"])
# Description: at least 90% of the rows must carry a value.
validator_transactions.expect_column_to_exist("description")
validator_transactions.expect_column_values_to_not_be_null("description", mostly=0.90)
# The combination of these four columns must not repeat across rows.
validator_transactions.expect_compound_columns_to_be_unique(["transaction_id", "transaction_date", "account_id", "amount"])

# Persist the suite; failed expectations are kept rather than discarded.
validator_transactions.save_expectation_suite(discard_failed_expectations=False)
print("Financial transaction expectation suite 'financial_transactions_suite' saved.")

# Register a checkpoint that ties the transaction validator to three
# post-validation actions, then run it by name and keep the result for the
# summary printed afterwards.
# NOTE(review): the action entries use an "action_class" key — confirm this
# is the shape the installed Great Expectations version expects for
# `action_list` items.
checkpoint_name_transactions = "financial_transactions_checkpoint"
context.add_checkpoint(
    name=checkpoint_name_transactions,
    validator=validator_transactions,
    action_list=[
        {"name": "store_validation_result", "action_class": "StoreValidationResultAction"},
        {"name": "store_evaluation_parameter_metrics", "action_class": "StoreEvaluationParametersAction"},
        {"name": "update_data_docs", "action_class": "UpdateDataDocsAction"},
    ],
)
validation_result_transactions = context.run_checkpoint(checkpoint_name=checkpoint_name_transactions)

# Summarize the transaction validation run: one line on overall success, then
# one entry per failed expectation.
# NOTE(review): this reads `.results` straight off the object returned by
# `run_checkpoint` — confirm that object exposes per-expectation results
# directly in the installed Great Expectations version.
print("\nFinancial Transaction Validation Results Summary:")
if validation_result_transactions.success:
    print("Financial transaction validation successful! All expectations passed.")
else:
    print("Financial transaction validation failed. Some expectations did not pass.")
    for result in validation_result_transactions.results:
        if result.success:
            continue
        failed_config = result.expectation_config
        print(f"  Failed Expectation: {failed_config.expectation_type}")
        # The target column lives in the expectation's kwargs and is absent
        # for multi-column/table-level expectations, so it is looked up
        # defensively and printed once.
        failed_column = failed_config.kwargs.get("column")
        if failed_column:
            print(f"    for column '{failed_column}'")
        print(f"    Details: {result.result}")


# --- New Task: Healthcare – Patient Data Accuracy ---

print("\n--- New Task: Healthcare – Patient Data Accuracy ---")

# Task 1: Patient Record Accuracy Assessment

print("\n--- Task 1: Patient Record Accuracy Assessment ---")

# 1. Examine a sample patient dataset for common inaccuracies.
# The records are written one patient per line so every deliberately planted
# problem sits next to the record it belongs to, then transposed into the
# column -> values mapping that pandas consumes.
print("\nCreating a sample patient dataset...")
_PATIENT_COLUMNS = (
    'patient_id', 'first_name', 'last_name', 'date_of_birth', 'gender',
    'diagnosis', 'medication', 'last_visit_date',
    'blood_pressure_systolic', 'blood_pressure_diastolic', 'allergies',
)
_patient_rows = [
    ('P001', 'John', 'Doe', '1980-05-10', 'Male', 'Flu', 'Amoxicillin', '2024-01-20', 120, 80, 'Penicillin'),
    # Placeholder medication and missing allergies.
    ('P002', 'Jane', 'Smith', '1992-11-23', 'Female', 'Common Cold', 'N/A', '2025-02-15', 110, 70, None),
    ('P003', 'Peter', 'Jones', '1975-01-01', 'Male', 'Hypertension', 'Lisinopril', '2024-03-01', 145, 95, 'None'),
    ('P004', 'Mary', 'Brown', '2000-03-15', 'Female', 'Diabetes', 'Metformin', '2025-04-10', 130, 85, 'Sulfur'),
    # Inconsistent 'MALE' casing and high blood pressure.
    ('P005', 'Alice', 'Williams', '1968-09-02', 'MALE', 'Arthritis', 'Ibuprofen', '2024-05-05', 160, 100, 'Latex'),
    # Duplicate P001.
    ('P001', 'John', 'Doe', '1980-05-10', 'Male', 'Flu', 'Amoxicillin', '2024-01-20', 120, 80, 'Penicillin'),
    ('P006', 'Robert', 'Johnson', '1995-07-20', 'Non-binary', 'Asthma', 'Ventolin', '2025-01-01', 125, 80, 'Pollen'),
]
data_patients = {
    column: list(values)
    for column, values in zip(_PATIENT_COLUMNS, zip(*_patient_rows))
}
df_patients = pd.DataFrame(data_patients)
print("Patient Data (first 5 rows):")
print(df_patients.head())

# 2. Identify at least three common issues, such as medication errors or misdiagnoses.
# The findings are kept as a tuple of ready-to-print lines and emitted in order.
print("\nIdentifying common inaccuracies in patient records:")
for finding in (
    " - Duplicate Patient IDs: 'P001' appears twice, indicating potential duplicate records.",
    " - Inconsistent 'gender' entries: 'Male' vs 'MALE', and 'Non-binary' which might not be an allowed value.",
    " - Medication errors/inconsistencies: 'N/A' for medication, and `None` for allergies.",
    " - Out-of-range values: Blood pressure values might be outside typical healthy ranges (e.g., very high/low).",
    " - Missing values: 'allergies' column has `None`.",
    " - Date inconsistencies: 'date_of_birth' and 'last_visit_date' could have future dates or illogical sequences.",
):
    print(finding)

# 3. Propose validation measures to ensure data accuracy at the point of entry.
# Same approach: one printable line per proposed measure.
print("\nProposing validation measures for patient data accuracy at point of entry:")
for measure in (
    " - **Unique Identifiers**: Ensure `patient_id` is unique upon creation.",
    " - **Data Type & Format Checks**: Verify dates are valid dates, names are strings, etc.",
    " - **Lookup/Controlled Vocabularies**: Restrict `gender`, `diagnosis`, `medication` to predefined, valid lists.",
    " - **Range Checks**: Validate numerical fields like `blood_pressure_systolic/diastolic` within acceptable medical ranges.",
    " - **Conditional Logic**: If `diagnosis` is 'Diabetes', then `medication` should likely include 'Metformin' or similar, or at least not be 'N/A'.",
    " - **Completeness**: Ensure critical fields like `first_name`, `last_name`, `date_of_birth` are not null.",
    " - **Referential Integrity**: If linking to another table (e.g., doctor_id), ensure the foreign key exists.",
    " - **Historical Consistency**: Ensure `last_visit_date` is not before `date_of_birth` or previous visit dates.",
):
    print(measure)


# Task 2: Implement Healthcare Data Quality Checks

print("\n--- Task 2: Implement Healthcare Data Quality Checks ---")

# 1. Develop a validation workflow for patient data.
# This involves adding the DataFrame to GE and defining an expectation suite.

# Derive real datetime columns BEFORE the frame is handed to Great
# Expectations, so the batch the validator works on is guaranteed to contain
# them. Unparseable dates become NaT (errors='coerce') instead of raising.
df_patients['date_of_birth_dt'] = pd.to_datetime(df_patients['date_of_birth'], errors='coerce')
df_patients['last_visit_date_dt'] = pd.to_datetime(df_patients['last_visit_date'], errors='coerce')
# Midnight today (naive, local time): the upper bound for "not in the future".
current_date_ts = pd.Timestamp(datetime.now().date())

# NOTE(review): a `PandasDatasource` configured with a datasource-level
# `batch_spec_passthrough`, and a DataFrame handed over through
# `batch_filter_parameters`, do not look like the usual way to pass an
# in-memory DataFrame to Great Expectations — confirm both calls against the
# installed GE version.
datasource_name_patients = "patient_data_source"
data_asset_name_patients = "patient_records"

context.add_datasource(
    name=datasource_name_patients,
    class_name="PandasDatasource",
    batch_spec_passthrough={"reader_method": "dataframe"},
)

batch_request_patients = ge.core.batch_request.BatchRequest(
    datasource_name=datasource_name_patients,
    data_asset_name=data_asset_name_patients,
    data_connector_name="default_runtime_data_connector",
    data_connector_query={"batch_filter_parameters": {"batch_data": df_patients}},
)
validator_patients = context.get_validator(
    batch_request=batch_request_patients,
    expectation_suite_name="patient_data_suite"
)

# 2. Use appropriate software to automate checks for common errors. (Using Great Expectations)
print("\nAutomating checks for common patient data errors using Great Expectations...")

# Expectation: `patient_id` is unique and not null
validator_patients.expect_column_to_exist("patient_id")
validator_patients.expect_column_values_to_be_unique("patient_id")
validator_patients.expect_column_values_to_not_be_null("patient_id")

# Expectation: Names are strings and not null
validator_patients.expect_column_to_exist("first_name")
validator_patients.expect_column_values_to_be_of_type("first_name", "str")
validator_patients.expect_column_values_to_not_be_null("first_name")
validator_patients.expect_column_to_exist("last_name")
validator_patients.expect_column_values_to_be_of_type("last_name", "str")
validator_patients.expect_column_values_to_not_be_null("last_name")

# Expectation: `date_of_birth` is a valid date format and not in the future
validator_patients.expect_column_to_exist("date_of_birth")
validator_patients.expect_column_values_to_match_strftime_format("date_of_birth", "%Y-%m-%d")
# Range check on the derived datetime column (the string column only gets
# the format check above).
validator_patients.expect_column_values_to_be_between(
    "date_of_birth_dt",
    min_value=pd.Timestamp('1900-01-01'), # A reasonable historical start date
    max_value=current_date_ts,
    parse_strings_as_datetimes=False # Already converted
)

# Expectation: `gender` is from a controlled vocabulary and consistent casing
validator_patients.expect_column_to_exist("gender")
validator_patients.expect_column_values_to_be_in_set("gender", ["Male", "Female", "Other", "Unknown"])
validator_patients.expect_column_values_to_match_regex("gender", r"^(Male|Female|Other|Unknown)$") # For exact matching/casing

# Expectation: `diagnosis` is not null and is a string
validator_patients.expect_column_to_exist("diagnosis")
validator_patients.expect_column_values_to_not_be_null("diagnosis")
validator_patients.expect_column_values_to_be_of_type("diagnosis", "str")

# Expectation: `medication` is populated for a high percentage of records
validator_patients.expect_column_to_exist("medication")
validator_patients.expect_column_values_to_not_be_null("medication", mostly=0.95) # Tolerates a few true nulls
# Placeholder strings such as 'N/A' are NOT null, so the check above cannot
# see them; flag them explicitly, since the assessment lists them as an issue.
validator_patients.expect_column_values_to_not_be_in_set("medication", ["N/A", "NA", "None", ""])

# Expectation: `last_visit_date` is a valid date and not in the future, and after DOB
validator_patients.expect_column_to_exist("last_visit_date")
validator_patients.expect_column_values_to_match_strftime_format("last_visit_date", "%Y-%m-%d")
validator_patients.expect_column_values_to_be_between(
    "last_visit_date_dt",
    min_value=pd.Timestamp('1900-01-01'), # Should be after DOB, but check overall range
    max_value=current_date_ts,
    parse_strings_as_datetimes=False
)
# Cross-column rule: a visit cannot precede the patient's birth. This uses
# the built-in column-pair expectation on the two derived datetime columns.
validator_patients.expect_column_pair_values_a_to_be_greater_than_b(
    column_A="last_visit_date_dt",
    column_B="date_of_birth_dt",
    or_equal=True, # A visit can theoretically be on the birth date for new borns
    parse_strings_as_datetimes=False
)

# Expectation: Blood pressure values are within reasonable medical ranges
validator_patients.expect_column_to_exist("blood_pressure_systolic")
validator_patients.expect_column_values_to_be_between("blood_pressure_systolic", min_value=70, max_value=200) # Typical ranges
validator_patients.expect_column_to_exist("blood_pressure_diastolic")
validator_patients.expect_column_values_to_be_between("blood_pressure_diastolic", min_value=40, max_value=120) # Typical ranges
# Expect systolic to be greater than diastolic
validator_patients.expect_column_pair_values_a_to_be_greater_than_b(
    column_A="blood_pressure_systolic",
    column_B="blood_pressure_diastolic"
)

# Expectation: Allergies column exists and not null for a majority
validator_patients.expect_column_to_exist("allergies")
validator_patients.expect_column_values_to_not_be_null("allergies", mostly=0.80) # Allow some missing

# Save the patient data expectation suite; failed expectations are kept
# because the sample data is deliberately dirty.
validator_patients.save_expectation_suite(discard_failed_expectations=False)
print("Patient data expectation suite 'patient_data_suite' saved.")

# Register a checkpoint that ties the patient validator to three
# post-validation actions, then run it by name and keep the result for the
# summary printed afterwards. Action names follow the same lower-case
# convention as the customer and financial checkpoints.
# NOTE(review): the action entries use an "action_class" key — confirm this
# is the shape the installed Great Expectations version expects for
# `action_list` items.
checkpoint_name_patients = "patient_data_checkpoint"
context.add_checkpoint(
    name=checkpoint_name_patients,
    validator=validator_patients,
    action_list=[
        {"name": "store_validation_result", "action_class": "StoreValidationResultAction"},
        {"name": "store_evaluation_parameter_metrics", "action_class": "StoreEvaluationParametersAction"},
        {"name": "update_data_docs", "action_class": "UpdateDataDocsAction"},
    ],
)
validation_result_patients = context.run_checkpoint(checkpoint_name=checkpoint_name_patients)

# Summarize the patient validation run: one line on overall success, then one
# entry per failed expectation.
# NOTE(review): this reads `.results` straight off the object returned by
# `run_checkpoint` — confirm that object exposes per-expectation results
# directly in the installed Great Expectations version.
print("\nPatient Data Validation Results Summary:")
if validation_result_patients.success:
    print("Patient data validation successful! All expectations passed.")
else:
    print("Patient data validation failed. Some expectations did not pass.")
    for result in validation_result_patients.results:
        if result.success:
            continue
        failed_config = result.expectation_config
        print(f"  Failed Expectation: {failed_config.expectation_type}")
        # The target column lives in the expectation's kwargs and is absent
        # for column-pair/table-level expectations, so it is looked up
        # defensively instead of being read as an attribute.
        failed_column = failed_config.kwargs.get("column")
        if failed_column:
            print(f"    for column '{failed_column}'")
        print(f"    Details: {result.result}")


# Open Data Docs to review all validation reports.
# NOTE(review): presumably this launches the rendered report site in the
# default browser; in a headless or remote notebook environment it may have
# no visible effect — confirm for the target environment.
print("\nOpening data Docs to review all validation reports...")
context.open_data_docs()
print("Data Docs report generated and opened in your browser (if supported by your environment).")

# --- Conceptual: Automate Periodic Checks (Scheduling) ---
# Nothing below schedules anything; it only prints guidance and example
# snippets for wiring the checkpoints into an external scheduler.

print("\n--- Conceptual: Automating Periodic Checks (Scheduling) ---")
print("Great Expectations validations for customer, financial, and patient data can be automated using various scheduling tools.")
print("This part is conceptual as it involves external tools or frameworks.")

# `__file__` is only defined when the code runs as a script. In a notebook or
# interactive session it is missing, so fall back to a placeholder path
# rather than crashing on a purely illustrative message.
try:
    script_path = os.path.abspath(__file__)
except NameError:
    script_path = "/path/to/this_script.py"

print("\n1. Cron Jobs (Linux/macOS) / Task Scheduler (Windows):")
print("   You can set up a cron job to run this Python script at a specific interval (e.g., daily).")
print("   Example cron entry (runs daily at 2 AM):")
print(f"   0 2 * * * /usr/bin/python3 {script_path}")

print("\n2. Apache Airflow / Prefect / Dagster:")
print("   Define a DAG (Directed Acyclic Graph) in Airflow (or similar workflow in Prefect/Dagster).")
print("   Each task in the DAG could represent a data quality check for a specific dataset.")
print("   Example Airflow DAG snippet:")
print("   from airflow import DAG")
print("   from airflow.operators.python import PythonOperator")
print("   from datetime import datetime")
# The snippet imports `ge` itself and loads the context through
# `get_context`, because the legacy `DataContext` class is not exported by
# current Great Expectations releases.
print("   import great_expectations as ge")
print("\n   def run_ge_validation(checkpoint_name):")
print("       context = ge.get_context(project_root_dir='my_ge_project')")
print("       validation_result = context.run_checkpoint(checkpoint_name=checkpoint_name)")
print("       if not validation_result.success:")
print("           raise ValueError(f'Validation failed for checkpoint {checkpoint_name}')")
print("\n   with DAG('data_quality_checks', start_date=datetime(2023, 1, 1), schedule_interval='@daily') as dag:")
print("       validate_customer_data = PythonOperator(")
print("           task_id='validate_customer_data',")
print("           python_callable=run_ge_validation,")
print("           op_kwargs={'checkpoint_name': 'customer_data_checkpoint'}")
print("       )")
print("       validate_financial_data = PythonOperator(")
print("           task_id='validate_financial_data',")
print("           python_callable=run_ge_validation,")
print("           op_kwargs={'checkpoint_name': 'financial_transactions_checkpoint'}")
print("       )")
print("       validate_patient_data = PythonOperator(")
print("           task_id='validate_patient_data',")
print("           python_callable=run_ge_validation,")
print("           op_kwargs={'checkpoint_name': 'patient_data_checkpoint'}")
print("       )")
print("       # You can define task dependencies here, e.g., validate_customer_data >> validate_financial_data >> validate_patient_data")

print("\n3. CI/CD Pipelines (e.g., Jenkins, GitLab CI, GitHub Actions):")
print("   Integrate Great Expectations validation into your CI/CD pipeline to ensure data quality")
print("   before deploying new data or models to production.")

print("\nTo truly automate, you would integrate this script into a scheduler or orchestrator.")
print("The Data Docs can then be hosted on a web server for easy access to validation reports.")


ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)