## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library (install it with `pip install schedule`) to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, use a function calculate_quality_metrics() that calculates and logs metrics such as missing rate or mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [1]:
# Write your code from here
import schedule
import time
import logging
import pandas as pd
from datetime import datetime

# Route INFO-and-above records to a log file; each record carries a timestamp
# so the metric values can be followed over time.
logging.basicConfig(
    filename='data_quality_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Configuration ---
# Input files.
DATASET_PATH = 'your_dataset.csv'       # Replace with the actual path to your dataset
TRUSTED_DATA_PATH = 'trusted_data.csv'  # Optional: reference data for accuracy checks

# Alert thresholds (fractions between 0 and 1).
ACCURACY_THRESHOLD = 0.95      # Example: flag accuracy below 95%
COMPLETENESS_THRESHOLD = 0.90  # Example: flag completeness below 90%

# Column names.
DATE_COLUMN = 'date'     # Replace with the name of your date column (if applicable)
UNIQUE_ID_COLUMN = 'id'  # Replace with the name of your unique identifier column
COLUMNS_TO_CHECK_COMPLETENESS = [  # Columns included in the completeness check
    'column1',
    'column2',
    'amount',
]

def _completeness_metrics(df, columns, threshold):
    """Return per-column and overall completeness rates for *df*, logging each."""
    metrics = {}
    num_records = len(df)
    if num_records == 0:
        logging.warning("Dataset is empty, cannot calculate completeness.")
        metrics['overall_completeness'] = 0.0
        return metrics

    rates = []
    for col in columns:
        if col in df.columns:
            # Series.count() counts non-null cells only.
            rate = float(df[col].count() / num_records)
            if rate < threshold:
                logging.warning("Completeness for '%s' is below threshold: %.2f", col, rate)
            else:
                logging.info("Completeness for '%s': %.2f", col, rate)
        else:
            # A configured column that is absent holds no data at all, so it
            # must lower the overall score instead of counting as complete.
            rate = 0.0
            logging.warning("Column '%s' not found for completeness check.", col)
        metrics[f'completeness_{col}'] = rate
        rates.append(rate)

    # With no columns configured there is nothing that could be incomplete.
    overall = sum(rates) / len(rates) if rates else 1.0
    metrics['overall_completeness'] = overall
    logging.info("Overall Completeness: %.2f", overall)
    if overall < threshold:
        logging.error("Overall completeness is below threshold: %.2f", overall)
    return metrics


def _accuracy_metrics(df, trusted_data_path, id_column, threshold, tolerance):
    """Return accuracy metrics from comparing *df* with the trusted dataset."""
    # 'accuracy' stays None whenever no accuracy figure could be computed.
    metrics = {'accuracy': None}
    if not trusted_data_path:
        logging.info("Trusted data path not provided, skipping accuracy checks.")
        return metrics

    try:
        trusted_df = pd.read_csv(trusted_data_path)
    except FileNotFoundError:
        logging.error("Trusted data file not found: %s", trusted_data_path)
        return metrics

    if id_column not in df.columns or id_column not in trusted_df.columns:
        logging.error(
            "Unique ID column '%s' not found in one or both datasets for accuracy check.",
            id_column,
        )
        return metrics

    merged = pd.merge(
        df, trusted_df, on=id_column, suffixes=('_current', '_trusted'), how='inner'
    )
    if merged.empty:
        # Nothing could be compared, so no accuracy figure is reported.
        logging.warning("No matching records found for accuracy check.")
        return metrics

    # The suffixed columns only exist when both inputs have a 'price' column.
    if 'price_current' not in merged.columns or 'price_trusted' not in merged.columns:
        logging.warning("Column 'price' not found in both datasets, skipping price accuracy check.")
        return metrics

    current = pd.to_numeric(merged['price_current'], errors='coerce')
    trusted = pd.to_numeric(merged['price_trusted'], errors='coerce')
    # Comparisons with NaN are False, so testing "within tolerance" (rather
    # than "outside tolerance") makes missing or non-numeric prices mismatches.
    within_tolerance = (current - trusted).abs() <= tolerance
    mismatches = int((~within_tolerance).sum())
    accuracy_price = 1 - mismatches / len(merged)

    metrics['accuracy_price'] = accuracy_price
    # Price is the only accuracy check implemented, so it is also the overall figure.
    metrics['accuracy'] = accuracy_price
    logging.info("Price Accuracy: %.2f", accuracy_price)
    if accuracy_price < threshold:
        logging.error(
            "Price accuracy is below threshold: %.2f, %d mismatches found.",
            accuracy_price, mismatches,
        )
    # Add more accuracy checks for other relevant columns here.
    return metrics


def calculate_quality_metrics(dataset_path, trusted_data_path=None, *,
                              columns=None, completeness_threshold=None,
                              accuracy_threshold=None, id_column=None,
                              price_tolerance=0.01):
    """
    Calculate data quality metrics (completeness and accuracy) and log them.

    Args:
        dataset_path: Path of the CSV file to assess.
        trusted_data_path: Optional path of a trusted CSV file. When given,
            rows are matched on the ID column and 'price' values are compared.
        columns: Columns to check for completeness. Defaults to
            COLUMNS_TO_CHECK_COMPLETENESS.
        completeness_threshold: Rate below which completeness is flagged.
            Defaults to COMPLETENESS_THRESHOLD.
        accuracy_threshold: Rate below which accuracy is flagged. Defaults to
            ACCURACY_THRESHOLD.
        id_column: Column used to match rows. Defaults to UNIQUE_ID_COLUMN.
        price_tolerance: Largest absolute price difference still counted as
            a match.

    Returns:
        A dict of metrics, or None when the dataset could not be processed.
        Keys: 'completeness_<col>' for each checked column (0.0 when the
        column is missing), 'overall_completeness', 'accuracy' (None when
        no accuracy could be computed) and, when the price comparison ran,
        'accuracy_price'.
    """
    # Resolve defaults at call time so later changes to the module-level
    # configuration are picked up by every scheduled run.
    columns = COLUMNS_TO_CHECK_COMPLETENESS if columns is None else columns
    completeness_threshold = (
        COMPLETENESS_THRESHOLD if completeness_threshold is None else completeness_threshold
    )
    accuracy_threshold = ACCURACY_THRESHOLD if accuracy_threshold is None else accuracy_threshold
    id_column = UNIQUE_ID_COLUMN if id_column is None else id_column

    try:
        df = pd.read_csv(dataset_path)
        metrics = _completeness_metrics(df, columns, completeness_threshold)
        metrics.update(
            _accuracy_metrics(df, trusted_data_path, id_column, accuracy_threshold, price_tolerance)
        )
        logging.info("Calculated Metrics: %s", metrics)
        return metrics
    except FileNotFoundError:
        logging.error("Dataset file not found: %s", dataset_path)
    except Exception as e:
        # Deliberately broad: this runs inside a scheduler loop and one bad
        # run must not stop the monitor. The traceback is kept in the log.
        logging.exception("An error occurred during metric calculation: %s", e)
    return None

def job():
    """
    Run one data quality check on the configured dataset.

    Writes a start and an end marker to the log around the metric
    calculation so each scheduled run is easy to locate in the log file.
    """
    logging.info("Running data quality check...")
    calculate_quality_metrics(
        dataset_path=DATASET_PATH,
        trusted_data_path=TRUSTED_DATA_PATH,
    )
    logging.info("Data quality check completed.")

# Schedule the job to run periodically
schedule.every().hour.do(job) # Example: Run every hour
# schedule.every().day.at("09:00").do(job) # Example: Run every day at 9:00 AM
# schedule.every(5).minutes.do(job) # Example: Run every 5 minutes

if __name__ == "__main__":
    logging.info("Data quality monitoring system started.")
    try:
        while True:
            # run_pending() only executes jobs that are due, so poll it once
            # a second and sleep in between to avoid a busy loop.
            schedule.run_pending()
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the monitor: record the shutdown
        # in the log instead of ending with a traceback.
        logging.info("Data quality monitoring system stopped.")

ModuleNotFoundError: No module named 'schedule'