### Finance – Ensuring Accurate Transactions

**Task 1**: Transaction Data Validation Insights

**Objective**: Maintain transaction integrity.

**Steps**:
1. Choose a sample financial transaction dataset.
2. Identify common transaction issues like duplicate entries or incorrect amounts.
3. Develop a list of validation checks specific to financial transactions.

In [None]:
# Write your code from here

**Task 2**: Implement Financial Data Validation

**Objective**: Use automated tools to ensure transaction accuracy.

**Steps**:
1. Integrate data validation rules into your existing financial systems.
2. Ensure real-time checks to validate data upon entry.

In [None]:
# Write your code from here


In [1]:
# This script demonstrates how to automate data quality checks using Great Expectations.
# It covers setting up a data context, defining expectations, validating data,
# and generating data quality reports.

# --- Prerequisites ---
# Before running this code, you need to install Great Expectations.
# Open your terminal or command prompt and run:
# pip install great_expectations pandas

import pandas as pd
import great_expectations as ge
from great_expectations.data_context import DataContext
import os
import shutil
from datetime import datetime

# Define a directory for the Great Expectations project.
# This will create a 'great_expectations' subdirectory in your current working directory.
ge_project_dir = "my_ge_project"

# Clean up previous GE project if it exists for a fresh start
if os.path.exists(ge_project_dir):
    print(f"Removing existing Great Expectations project directory: {ge_project_dir}")
    shutil.rmtree(ge_project_dir)

# Initialize a Great Expectations data context.
# This creates the 'great_expectations' directory with configuration files.
print(f"Initializing Great Expectations data context in '{ge_project_dir}'...")
context = ge.data_context.DataContext.create(project_dir=ge_project_dir)
print("Great Expectations data context initialized.")

# --- Previous Tasks (Customer Data) ---

print("\n--- Previous Tasks: Customer Data Validation ---")

# Create a sample customer dataset
print("\nCreating a sample customer dataset...")
data_customer = {
    'customer_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 11], # Duplicate customer_id 1
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Heidi', 'Ivan', 'Judy', 'Alice', 'Kyle'],
    'email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com', 'eve@example.com',
              'frank@example.com', 'grace@example.com', 'heidi@example.com', 'ivan@example.com', 'judy@example.com',
              'alice@example.com', 'kyle@example.com'],
    'age': [25, 30, 35, 40, 28, 32, 29, 45, 22, 38, 25, None], # Missing age for Kyle
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose', 'New York', 'Austin'],
    'registration_date': ['2023-01-15', '2022-03-20', '2023-07-01', '2021-11-10', '2023-02-28',
                          '2022-09-05', '2023-04-12', '2021-06-30', '2023-05-18', '2022-01-01',
                          '2023-01-15', '2023-10-25'],
    'order_count': [5, 12, 8, 20, 3, 15, 7, 25, 2, 10, 5, 6],
    'is_active': [True, True, False, True, True, False, True, True, True, False, True, True]
}
df_customer = pd.DataFrame(data_customer)
print("Customer Data (first 5 rows):")
print(df_customer.head())

# Add customer DataFrame to a Great Expectations Data Source
datasource_name_customer = "customer_data_source"
data_asset_name_customer = "customer_records"

context.add_datasource(
    name=datasource_name_customer,
    class_name="PandasDatasource",
    batch_spec_passthrough={"reader_method": "dataframe"},
)

batch_request_customer = ge.core.batch_request.BatchRequest(
    datasource_name=datasource_name_customer,
    data_asset_name=data_asset_name_customer,
    data_connector_name="default_runtime_data_connector",
    data_connector_query={"batch_filter_parameters": {"batch_data": df_customer}},
)
validator_customer = context.get_validator(
    batch_request=batch_request_customer,
    expectation_suite_name="customer_data_suite"
)

print("\nCreating basic expectations for customer dataset...")
validator_customer.expect_column_to_exist("customer_id")
validator_customer.expect_column_values_to_be_of_type("customer_id", "int")
validator_customer.expect_column_values_to_be_unique("customer_id") # Added unique for customer_id
validator_customer.expect_column_to_exist("name")
validator_customer.expect_column_values_to_be_of_type("name", "str")
validator_customer.expect_column_to_exist("age")
validator_customer.expect_column_values_to_be_of_type("age", "int")
validator_customer.expect_column_values_to_be_between("age", min_value=18, max_value=99, allow_missing=True)
validator_customer.expect_column_to_exist("email")
validator_customer.expect_column_values_to_match_regex("email", r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")
validator_customer.expect_column_to_exist("registration_date")
validator_customer.expect_column_values_to_match_strftime_format("registration_date", "%Y-%m-%d")
validator_customer.expect_column_to_exist("order_count")
validator_customer.expect_column_values_to_be_of_type("order_count", "int")
validator_customer.expect_column_values_to_be_between("order_count", min_value=0, max_value=None)
validator_customer.expect_column_to_exist("is_active")
validator_customer.expect_column_values_to_be_of_type("is_active", "bool")

# Save the customer expectation suite
validator_customer.save_expectation_suite(discard_failed_expectations=False)
print("Customer expectation suite 'customer_data_suite' saved.")

# Run validation for customer data
checkpoint_name_customer = "customer_data_checkpoint"
context.add_checkpoint(
    name=checkpoint_name_customer,
    validator=validator_customer,
    action_list=[
        {"name": "store_validation_result", "action_class": "StoreValidationResultAction"},
        {"name": "store_evaluation_parameter_metrics", "action_class": "StoreEvaluationParametersAction"},
        {"name": "update_data_docs", "action_class": "UpdateDataDocsAction"},
    ],
)
validation_result_customer = context.run_checkpoint(checkpoint_name=checkpoint_name_customer)

print("\nCustomer Data Validation Results Summary:")
if validation_result_customer.success:
    print("Customer data validation successful! All expectations passed.")
else:
    print("Customer data validation failed. Some expectations did not pass.")
    for result in validation_result_customer.results:
        if not result.success:
            print(f"  Failed Expectation: {result.expectation_config.expectation_type} for column '{result.expectation_config.column}'")
            print(f"    Details: {result.result}")

# --- New Task: Finance – Ensuring Accurate Transactions ---

print("\n--- New Task: Finance – Ensuring Accurate Transactions ---")

# Task 1: Transaction Data Validation Insights

print("\n--- Task 1: Transaction Data Validation Insights ---")

# 1. Choose a sample financial transaction dataset.
print("\nCreating a sample financial transaction dataset...")
data_transactions = {
    'transaction_id': ['T001', 'T002', 'T003', 'T004', 'T005', 'T006', 'T007', 'T001'], # Duplicate T001
    'transaction_date': ['2023-01-01', '2023-01-05', '2023-01-10', '2023-01-15', '2023-01-20', '2023-01-25', '2023-01-30', '2023-01-01'],
    'account_id': ['ACC001', 'ACC002', 'ACC001', 'ACC003', 'ACC002', 'ACC004', 'ACC001', 'ACC001'],
    'transaction_type': ['DEBIT', 'CREDIT', 'DEBIT', 'CREDIT', 'DEBIT', 'TRANSFER', 'DEBIT', 'DEBIT'],
    'amount': [100.50, 250.00, 50.25, 120.00, -30.00, 75.00, 200.00, 100.50], # Negative amount
    'currency': ['USD', 'USD', 'EUR', 'USD', 'USD', 'GBP', 'USD', 'USD'],
    'description': ['Groceries', 'Salary', 'Rent', 'Utilities', 'Refund', 'Online Purchase', None, 'Groceries'] # Missing description
}
df_transactions = pd.DataFrame(data_transactions)
print("Financial Transaction Data (first 5 rows):")
print(df_transactions.head())

# Add transaction DataFrame to a Great Expectations Data Source
datasource_name_transactions = "financial_data_source"
data_asset_name_transactions = "transactions"

context.add_datasource(
    name=datasource_name_transactions,
    class_name="PandasDatasource",
    batch_spec_passthrough={"reader_method": "dataframe"},
)

batch_request_transactions = ge.core.batch_request.BatchRequest(
    datasource_name=datasource_name_transactions,
    data_asset_name=data_asset_name_transactions,
    data_connector_name="default_runtime_data_connector",
    data_connector_query={"batch_filter_parameters": {"batch_data": df_transactions}},
)
validator_transactions = context.get_validator(
    batch_request=batch_request_transactions,
    expectation_suite_name="financial_transactions_suite"
)

# 2. Identify common transaction issues and 3. Develop validation checks.
print("\nDeveloping validation checks for financial transactions...")

# Expectation 1: `transaction_id` column exists and is unique
validator_transactions.expect_column_to_exist("transaction_id")
validator_transactions.expect_column_values_to_be_unique("transaction_id")

# Expectation 2: `transaction_date` column exists and is in correct format
validator_transactions.expect_column_to_exist("transaction_date")
validator_transactions.expect_column_values_to_match_strftime_format("transaction_date", "%Y-%m-%d")

# Expectation 3: `account_id` column exists and values are strings
validator_transactions.expect_column_to_exist("account_id")
validator_transactions.expect_column_values_to_be_of_type("account_id", "str")
validator_transactions.expect_column_values_to_match_regex("account_id", r"^ACC\d{3}$") # e.g., ACC001

# Expectation 4: `transaction_type` column exists and values are from a set of expected types
validator_transactions.expect_column_to_exist("transaction_type")
validator_transactions.expect_column_values_to_be_in_set("transaction_type", ["DEBIT", "CREDIT", "TRANSFER", "FEE"])

# Expectation 5: `amount` column exists, is numeric, and is non-negative
validator_transactions.expect_column_to_exist("amount")
validator_transactions.expect_column_values_to_be_of_type("amount", "float")
validator_transactions.expect_column_values_to_be_between("amount", min_value=0.01, max_value=None) # Amount must be positive

# Expectation 6: `currency` column exists and values are from a set of allowed currencies
validator_transactions.expect_column_to_exist("currency")
validator_transactions.expect_column_values_to_be_in_set("currency", ["USD", "EUR", "GBP", "JPY"])

# Expectation 7: `description` column exists and has no more than 10% missing values
validator_transactions.expect_column_to_exist("description")
validator_transactions.expect_column_values_to_not_be_null("description", mostly=0.90)

# Expectation 8: No duplicate entries for a combination of (transaction_id, transaction_date, account_id, amount)
validator_transactions.expect_compound_columns_to_be_unique(["transaction_id", "transaction_date", "account_id", "amount"])

# Save the financial transaction expectation suite
validator_transactions.save_expectation_suite(discard_failed_expectations=False)
print("Financial transaction expectation suite 'financial_transactions_suite' saved.")


# Task 2: Implement Financial Data Validation

print("\n--- Task 2: Implement Financial Data Validation ---")

# Integrate data validation rules (already done by defining expectations above)
# Ensure real-time checks to validate data upon entry (conceptual and integration)

# To simulate real-time checks, you would typically integrate this into:
# 1. An API endpoint where transactions are submitted.
# 2. A message queue consumer that processes new transaction events.
# 3. A database trigger or stored procedure (less common for GE).

print("\nExecuting real-time-like validation for financial transactions...")

# For demonstration, we'll run the validation for the current DataFrame.
# In a real-time system, this would be triggered for each new transaction or batch of transactions.

# Run the checkpoint for financial transactions
checkpoint_name_transactions = "financial_transactions_checkpoint"
context.add_checkpoint(
    name=checkpoint_name_transactions,
    validator=validator_transactions,
    action_list=[
        {"name": "store_validation_result", "action_class": "StoreValidationResultAction"},
        {"name": "store_evaluation_parameter_metrics", "action_class": "StoreEvaluationParametersAction"},
        {"name": "update_data_docs", "action_class": "UpdateDataDocsAction"},
    ],
)
validation_result_transactions = context.run_checkpoint(checkpoint_name=checkpoint_name_transactions)

print("\nFinancial Transaction Validation Results Summary:")
if validation_result_transactions.success:
    print("Financial transaction validation successful! All expectations passed.")
else:
    print("Financial transaction validation failed. Some expectations did not pass.")
    for result in validation_result_transactions.results:
        if not result.success:
            print(f"  Failed Expectation: {result.expectation_config.expectation_type} for column '{result.expectation_config.column}'")
            if result.expectation_config.column: # Check if column is in the expectation config
                print(f"    for column '{result.expectation_config.column}'")
            print(f"    Details: {result.result}")

# Open Data Docs to review both customer and financial transaction reports
print("\nOpening Data Docs to review all validation reports...")
context.open_data_docs()
print("Data Docs report generated and opened in your browser (if supported by your environment).")

# --- Conceptual: Automate Periodic Checks (Scheduling) ---

print("\n--- Conceptual: Automating Periodic Checks (Scheduling) ---")
print("Great Expectations validations for both customer and financial data can be automated using various scheduling tools.")
print("This part is conceptual as it involves external tools or frameworks.")

print("\n1. Cron Jobs (Linux/macOS) / Task Scheduler (Windows):")
print("   You can set up a cron job to run this Python script at a specific interval (e.g., daily).")
print("   Example cron entry (runs daily at 2 AM):")
print(f"   0 2 * * * /usr/bin/python3 {os.path.abspath(__file__)}")

print("\n2. Apache Airflow / Prefect / Dagster:")
print("   Define a DAG (Directed Acyclic Graph) in Airflow (or similar workflow in Prefect/Dagster).")
print("   Each task in the DAG could represent a data quality check for a specific dataset.")
print("   Example Airflow DAG snippet:")
print("   from airflow import DAG")
print("   from airflow.operators.python import PythonOperator")
print("   from datetime import datetime")
print("\n   def run_ge_validation(checkpoint_name):")
print("       context = ge.data_context.DataContext(context_root_dir='my_ge_project')")
print("       validation_result = context.run_checkpoint(checkpoint_name=checkpoint_name)")
print("       if not validation_result.success:")
print("           raise ValueError(f'Validation failed for checkpoint {checkpoint_name}')")
print("\n   with DAG('data_quality_checks', start_date=datetime(2023, 1, 1), schedule_interval='@daily') as dag:")
print("       validate_customer_data = PythonOperator(")
print("           task_id='validate_customer_data',")
print("           python_callable=run_ge_validation,")
print("           op_kwargs={'checkpoint_name': 'customer_data_checkpoint'}")
print("       )")
print("       validate_financial_data = PythonOperator(")
print("           task_id='validate_financial_data',")
print("           python_callable=run_ge_validation,")
print("           op_kwargs={'checkpoint_name': 'financial_transactions_checkpoint'}")
print("       )")
print("       # You can define task dependencies here, e.g., validate_customer_data >> validate_financial_data")

print("\n3. CI/CD Pipelines (e.g., Jenkins, GitLab CI, GitHub Actions):")
print("   Integrate Great Expectations validation into your CI/CD pipeline to ensure data quality")
print("   before deploying new data or models to production.")

print("\nTo truly automate, you would integrate this script into a scheduler or orchestrator.")
print("The Data Docs can then be hosted on a web server for easy access to validation reports.")



ImportError: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)