## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, write a single function `calculate_quality_metrics()` that calculates and logs metrics such as the missing rate or the mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [None]:
# Write your code from here

In [1]:
import pandas as pd
import logging
import datetime
import time # For scheduling (in a simplified loop)
# import schedule # You would use this in a real-world scenario

# --- 1. Setup Logging ---
# Send data quality metrics both to 'data_quality_log.txt' (created in the
# current working directory) and to the console.
#
# The handlers are attached explicitly and tagged with a name instead of
# relying on logging.basicConfig(), because:
#   * basicConfig() silently does nothing when the root logger already has
#     handlers, so the log file and the INFO level could be skipped entirely;
#   * re-running this code (e.g. re-executing a notebook cell) would otherwise
#     add one more console handler each time and duplicate every log line.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

_root_logger = logging.getLogger()
_root_logger.setLevel(logging.INFO)

# Handlers installed by a previous execution of this setup, keyed by name.
_handlers_by_name = {h.get_name(): h for h in _root_logger.handlers}

# File handler: persists the metrics over time (append mode by default).
file_handler = _handlers_by_name.get('data_quality_file')
if file_handler is None:
    file_handler = logging.FileHandler('data_quality_log.txt')
    file_handler.set_name('data_quality_file')
    file_handler.setFormatter(formatter)
    _root_logger.addHandler(file_handler)

# Console handler: shows the same records in real time.
console_handler = _handlers_by_name.get('data_quality_console')
if console_handler is None:
    console_handler = logging.StreamHandler()
    console_handler.set_name('data_quality_console')
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    _root_logger.addHandler(console_handler)

logging.info("Data Quality Monitoring System Initialized.")

# --- 2. Script to Calculate Metrics (calculate_quality_metrics function) ---
def _missing_percentage(csv_path, column):
    """Return the percentage (0-100) of null values in *column* of a CSV file.

    Raises whatever ``pd.read_csv`` raises (e.g. ``FileNotFoundError``) and
    ``KeyError`` when *column* is absent; the caller decides how to report it.
    """
    values = pd.read_csv(csv_path)[column]
    row_count = len(values)
    if row_count == 0:
        return 0.0  # Header-only file: nothing can be missing
    return float(values.isnull().sum() / row_count * 100)


def _email_conflict_percentage(crm_path, erp_path):
    """Return the percentage (0-100) of shared customers whose emails differ.

    Customers are matched on 'customer_id' (inner join). A pair counts as a
    conflict only when both emails are present and unequal; the denominator is
    the number of matched customers.
    """
    crm_df = pd.read_csv(crm_path)
    erp_df = pd.read_csv(erp_path)
    merged = pd.merge(
        crm_df, erp_df, on='customer_id', suffixes=('_crm', '_erp'), how='inner'
    )

    crm_email = merged['email_crm']
    erp_email = merged['email_erp']
    # NaN != NaN is True, so nulls must be excluded explicitly.
    is_conflict = (crm_email != erp_email) & crm_email.notnull() & erp_email.notnull()

    matched_count = len(merged)
    if matched_count == 0:
        return 0.0  # No common customers to compare
    return float(is_conflict.sum() / matched_count * 100)


def calculate_quality_metrics(
    prices_path='company_prices.csv',
    crm_path='crm_customers.csv',
    erp_path='erp_customers.csv',
):
    """
    Calculate and log data quality metrics for the monitored datasets.

    Metrics:
    - Completeness: percentage of missing 'price' values in *prices_path*.
    - Accuracy: percentage of customers present in both *crm_path* and
      *erp_path* (matched on 'customer_id') whose 'email' values conflict.

    Args:
        prices_path: CSV file with a 'price' column.
        crm_path: CRM customer CSV with 'customer_id' and 'email' columns.
        erp_path: ERP customer CSV with 'customer_id' and 'email' columns.

    Returns:
        dict with keys 'timestamp' (str, "%Y-%m-%d %H:%M:%S"),
        'missing_price_percentage' and 'email_conflict_percentage'. A metric
        is None when it could not be computed; the failure is logged and
        never raised, so one bad file does not stop a scheduled run.
    """
    logging.info("--- Starting Data Quality Metric Calculation ---")
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # --- Metric 1: Missing Data in the prices file (Completeness) ---
    missing_price_percentage = None
    try:
        missing_price_percentage = _missing_percentage(prices_path, 'price')
        logging.info(f"[{current_time}] {prices_path} - Missing 'price' percentage: {missing_price_percentage:.2f}%")
    except FileNotFoundError:
        logging.error(f"[{current_time}] Error: {prices_path} not found for completeness check.")
    except Exception as e:
        # Deliberately broad: a monitoring run must survive malformed input.
        # logging.exception keeps the traceback, since str(e) alone can be
        # cryptic (a missing column shows up as just "'price'").
        logging.exception(f"[{current_time}] Error calculating missing price percentage: {e}")

    # --- Metric 2: Email Conflicts between CRM and ERP (Accuracy) ---
    email_conflict_percentage = None
    try:
        email_conflict_percentage = _email_conflict_percentage(crm_path, erp_path)
        logging.info(f"[{current_time}] CRM/ERP - Email conflict percentage: {email_conflict_percentage:.2f}%")
    except FileNotFoundError:
        logging.error(f"[{current_time}] Error: CRM/ERP customer files not found for conflict check.")
    except Exception as e:
        logging.exception(f"[{current_time}] Error calculating email conflict percentage: {e}")

    logging.info("--- Finished Data Quality Metric Calculation ---")

    # Returned so callers can also store the metrics in a database or another system
    return {
        'timestamp': current_time,
        'missing_price_percentage': missing_price_percentage,
        'email_conflict_percentage': email_conflict_percentage
    }

# --- 3. Implement a Scheduled Script (Conceptual) ---
# For a real-world application, you would use a dedicated scheduler.

def run_monitoring_system_concept(interval_seconds=10, run_duration_seconds=60, job=None):
    """
    A conceptual function to demonstrate scheduling with a plain loop.
    In a real scenario, you'd use the 'schedule' library or OS-level
    schedulers (cron, Task Scheduler).

    Args:
        interval_seconds: Pause between the end of one run and the start of
            the next.
        run_duration_seconds: Time budget; no run starts once it has elapsed.
        job: Zero-argument callable executed on every run. Defaults to
            calculate_quality_metrics.
    """
    if job is None:
        # Resolved at call time so the default always tracks the current
        # module-level function.
        job = calculate_quality_metrics

    logging.info(f"Monitoring system will run every {interval_seconds} seconds for {run_duration_seconds} seconds (conceptual).")

    # monotonic() is immune to wall-clock adjustments (NTP sync, DST), which
    # could otherwise stretch or cut the run short.
    deadline = time.monotonic() + run_duration_seconds
    while time.monotonic() < deadline:
        job()
        if time.monotonic() + interval_seconds >= deadline:
            # The next run would start past the deadline: stop now instead of
            # announcing a run that never happens and sleeping a full,
            # pointless interval (which could far exceed the time budget).
            break
        logging.info(f"Next run in {interval_seconds} seconds...")
        time.sleep(interval_seconds)
    logging.info("Conceptual monitoring run finished.")

# --- How to use in a real scheduling scenario (using 'schedule' library) ---
# Uncomment and install 'schedule' (pip install schedule) for a more robust scheduler

# import schedule
# import time

# def job():
#     calculate_quality_metrics()

# # Schedule the job to run daily at a specific time, or every hour, etc.
# # schedule.every().day.at("03:00").do(job)
# # schedule.every(1).hour.do(job)
# schedule.every(10).seconds.do(job) # For quick testing

# logging.info("Scheduler configured. Waiting to run jobs...")
# while True:
#     schedule.run_pending()
#     time.sleep(1) # Wait one second between checks

# --- Run the conceptual monitoring system ---
if __name__ == "__main__":
    # Short demo with the plain-loop scheduler: one metrics run every
    # 5 seconds, for a total of 30 seconds.
    demo_settings = {"interval_seconds": 5, "run_duration_seconds": 30}
    run_monitoring_system_concept(**demo_settings)

    # To use the 'schedule' library instead, install it, uncomment the
    # scheduler section earlier in this file, and remove the call above.

2025-05-21 16:42:10,551 - INFO - Data Quality Monitoring System Initialized.
2025-05-21 16:42:10,558 - INFO - Monitoring system will run every 5 seconds for 30 seconds (conceptual).
2025-05-21 16:42:10,561 - INFO - --- Starting Data Quality Metric Calculation ---
2025-05-21 16:42:10,586 - INFO - [2025-05-21 16:42:10] company_prices.csv - Missing 'price' percentage: 0.00%
2025-05-21 16:42:10,707 - INFO - [2025-05-21 16:42:10] CRM/ERP - Email conflict percentage: 60.00%
2025-05-21 16:42:10,725 - INFO - --- Finished Data Quality Metric Calculation ---
2025-05-21 16:42:10,737 - INFO - Next run in 5 seconds...
2025-05-21 16:42:15,744 - INFO - --- Starting Data Quality Metric Calculation ---
2025-05-21 16:42:15,758 - INFO - [2025-05-21 16:42:15] company_prices.csv - Missing 'price' percentage: 0.00%
2025-05-21 16:42:15,779 - INFO - [2025-05-21 16:42:15] CRM/ERP - Email conflict percentage: 60.00%
2025-05-21 16:42:15,789 - INFO - --- Finished Data Quality Metric Calculation ---
2025-05-21 16: