# Measuring Data Accuracy

**Activity Overview**: Assess the accuracy of a dataset by comparing it against a trusted source and flagging incorrect or mismatched values.

## Title: Product Pricing

**Task**: Compare a dataset of product prices with the latest official price list.

**Steps**:
1. Obtain the latest product price list from the official company website.
2. Compare the dataset's product prices against the verified list.
3. Identify any discrepancies and mark them for correction.

In [None]:
# Write your code from here

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import datetime
import time # For conceptual scheduling

# --- Setup Logging for Monitoring System (from Task: Architecture to Monitor Data Quality Over Time) ---
# File logging: records at INFO and above go to data_quality_log.txt.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so the file handler is only installed by the first configuration.
logging.basicConfig(
    filename='data_quality_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Console logging: mirror the same records through a StreamHandler.
# The handler is tagged with a name and looked up before being created,
# because re-running this cell would otherwise attach one more StreamHandler
# to the root logger each time and every message would be printed repeatedly.
_CONSOLE_HANDLER_NAME = 'data_quality_console'
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler = next(
    (
        handler for handler in logging.getLogger().handlers
        if handler.get_name() == _CONSOLE_HANDLER_NAME
    ),
    None,
)
if console_handler is None:
    console_handler = logging.StreamHandler()
    console_handler.set_name(_CONSOLE_HANDLER_NAME)
    logging.getLogger().addHandler(console_handler)
# (Re)apply level and format so an already-attached handler picks up any edits.
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logging.info("Data Quality Monitoring System Initialized.")

# --- Helper Function to Load Data with Error Handling ---
def load_data(file_name):
    """Read ``file_name`` as CSV and return its contents as a DataFrame.

    Failures never propagate to the caller: a missing file or any other
    error raised while reading is logged at ERROR level and an empty
    DataFrame is returned instead, so callers can test ``.empty`` to detect
    that loading did not succeed.
    """
    try:
        frame = pd.read_csv(file_name)
    except FileNotFoundError:
        logging.error(f"Error: {file_name} not found. Please ensure it's in the same directory.")
    except Exception as exc:
        logging.error(f"Error loading {file_name}: {exc}")
    else:
        logging.info(f"Successfully loaded {file_name}")
        return frame
    # Reached only after one of the handlers above logged the failure.
    return pd.DataFrame()

# --- Task 1: Measure Data Accuracy using a Trusted Source (and "Product Pricing" Task) ---
# Goal: report how many prices in company_prices.csv agree with
# trusted_prices.csv and list every product whose price differs.
print("\n--- Task 1: Measure Data Accuracy using a Trusted Source (and Product Pricing Task) ---")

company_df_t1 = load_data('company_prices.csv')
trusted_df_t1 = load_data('trusted_prices.csv')

if company_df_t1.empty or trusted_df_t1.empty:
    # An empty frame here means at least one of the two files yielded no data.
    print("Skipping Task 1 due to file loading error.")
else:
    # Inner join: only products present in both files take part in the comparison.
    merged_df_t1 = company_df_t1.merge(
        trusted_df_t1,
        how='inner',
        on='product_id',
        suffixes=('_company', '_trusted'),
    )

    if merged_df_t1.empty:
        print("No common products found between company_prices.csv and trusted_prices.csv to compare.")
    else:
        # Row-wise exact equality between the company price and the trusted price.
        merged_df_t1['price_match'] = merged_df_t1['price_company'].eq(merged_df_t1['price_trusted'])
        total_compared_products = len(merged_df_t1)
        matching_prices_count = merged_df_t1['price_match'].sum()
        if total_compared_products > 0:
            accuracy = (matching_prices_count / total_compared_products) * 100
        else:
            accuracy = 0.0

        print(f"Total products compared (present in both files): {total_compared_products}")
        print(f"Number of matching prices: {matching_prices_count}")
        print(f"Data Accuracy (matching prices) against trusted source: {accuracy:.2f}%")

        # Everything that did not match is a discrepancy to be corrected.
        discrepancies_t1 = merged_df_t1[~merged_df_t1['price_match']]
        if discrepancies_t1.empty:
            print("\nNo price discrepancies found between company and trusted data for common products.")
        else:
            print("\nProducts with price discrepancies:")
            print(discrepancies_t1[['product_id', 'price_company', 'price_trusted']])


# --- Task 2: Detect Incorrect Values ---
# Goal: flag the rows of company_prices.csv whose price is below zero.
print("\n--- Task 2: Detect Incorrect Values ---")

# Prefer the frame already loaded for Task 1; read the file again only when
# that frame is empty.
if company_df_t1.empty:
    company_df_t2 = load_data('company_prices.csv')
else:
    company_df_t2 = company_df_t1

if company_df_t2.empty:
    print("Skipping Task 2 due to file loading error.")
else:
    negative_price_mask = company_df_t2['price'].lt(0)
    incorrect_prices_df = company_df_t2.loc[negative_price_mask]

    if incorrect_prices_df.empty:
        print("No negative price values found in company_prices.csv. All prices appear valid.")
    else:
        print("Detected incorrect (negative) price values:")
        print(incorrect_prices_df)


# --- Task 3: Check Missing Data Rates ---
# Description: Calculate the percentage of missing values in customer_data.csv,
# both per column and across the whole table.
print("\n--- Task 3: Check Missing Data Rates ---")

customer_df_t3 = load_data('customer_data.csv')

if not customer_df_t3.empty:
    # DataFrame.empty is True whenever either axis has length 0, so inside this
    # branch there is at least one row and one column: neither division below
    # can see a zero denominator, and no further emptiness guards are needed.
    total_rows_t3 = len(customer_df_t3)
    missing_values_count_t3 = customer_df_t3.isnull().sum()
    missing_values_percentage_t3 = (missing_values_count_t3 / total_rows_t3) * 100

    print("Missing values per column:")
    print(missing_values_count_t3)
    print("\nPercentage of missing values per column:")
    print(missing_values_percentage_t3.round(2).astype(str) + '%')

    # Overall rate = missing cells / all cells (rows x columns).
    total_cells_t3 = customer_df_t3.size
    total_missing_cells_t3 = missing_values_count_t3.sum()
    overall_missing_rate_t3 = (total_missing_cells_t3 / total_cells_t3) * 100
    print(f"\nOverall missing data rate across all columns: {overall_missing_rate_t3:.2f}%")
else:
    print("Skipping Task 3 due to file loading error.")


# --- Task 4: Handling Partially Available Records ---
# Description: In customer_data.csv, identify records with missing "email" or "phone number" and decide whether to drop or fill them.
print("\n--- Task 4: Handling Partially Available Records ---")

# Reuse the frame loaded for Task 3 unless it is empty, in which case try the file again.
customer_df_t4 = customer_df_t3 if not customer_df_t3.empty else load_data('customer_data.csv')

if not customer_df_t4.empty:
    # Only the 'email' column is checked here. If the data also carries a
    # 'phone number' column, OR its isnull() mask into the filter below and add
    # it to the dropna(subset=...) list in Option 1.
    missing_email_records = customer_df_t4[customer_df_t4['email'].isnull()]

    print("Records with missing 'email' (or 'phone number' if it existed):")
    if not missing_email_records.empty:
        print(missing_email_records)
    else:
        print("No records found with missing 'email' (or 'phone number' if present).")

    # Option 1: Drop records
    print("\n--- Option 1: Dropping records with missing contact info ---")
    customer_df_dropped = customer_df_t4.dropna(subset=['email'])
    print(f"Original number of records: {len(customer_df_t4)}")
    print(f"Number of records after dropping missing 'email': {len(customer_df_dropped)}")
    print("DataFrame after dropping:")
    print(customer_df_dropped.head()) # Use head() for brevity

    # Option 2: Fill missing values
    print("\n--- Option 2: Filling missing contact info with a placeholder ---")
    # Work on a copy so the original frame keeps its missing values.
    customer_df_filled = customer_df_t4.copy()
    # Assign the filled column back explicitly. Calling fillna(inplace=True) on
    # the result of df['email'] is chained assignment: it emits a FutureWarning
    # on recent pandas and, with Copy-on-Write, leaves the DataFrame unchanged.
    customer_df_filled['email'] = customer_df_filled['email'].fillna('missing@example.com')
    print("DataFrame after filling missing 'email' with 'missing@example.com':")
    print(customer_df_filled.head()) # Use head() for brevity
else:
    print("Skipping Task 4 due to file loading error.")


# --- Task: Find Conflicting Values Across Datasets ---
# Goal: list customers whose "email" differs between crm_customers.csv and
# erp_customers.csv.
print("\n--- Task: Find Conflicting Values Across Datasets ---")

crm_df_conflict = load_data('crm_customers.csv')
erp_df_conflict = load_data('erp_customers.csv')

if crm_df_conflict.empty or erp_df_conflict.empty:
    print("Skipping conflict detection due to file loading error.")
else:
    # Inner join: only customers present in both systems can be in conflict.
    merged_customers_df = crm_df_conflict.merge(
        erp_df_conflict,
        how='inner',
        on='customer_id',
        suffixes=('_crm', '_erp'),
    )

    crm_email = merged_customers_df['email_crm']
    erp_email = merged_customers_df['email_erp']
    # A conflict requires a value on both sides; a missing email on either
    # side is not counted as a conflict.
    both_present = crm_email.notnull() & erp_email.notnull()
    conflicting_emails_df = merged_customers_df[crm_email.ne(erp_email) & both_present]

    if conflicting_emails_df.empty:
        print("No conflicting email information found for customers present in both datasets.")
    else:
        print("Customers with conflicting email information:")
        print(conflicting_emails_df[['customer_id', 'name_crm', 'email_crm', 'email_erp']])


# --- Task: Compare Data Completeness Over Time ---
# Description: Analyze the trend of missing data in sales_data.csv over several months.
print("\n--- Task: Compare Data Completeness Over Time ---")

sales_df_completeness = load_data('sales_data.csv')

if not sales_df_completeness.empty:
    sales_df_completeness['date'] = pd.to_datetime(sales_df_completeness['date'])
    sales_df_completeness['year_month'] = sales_df_completeness['date'].dt.to_period('M')

    # Columns to check for missing values for this task
    cols_to_check = ['sales_amount', 'region', 'customer_segment']
    # Filter for columns that actually exist in the dataframe
    cols_to_check_existing = [col for col in cols_to_check if col in sales_df_completeness.columns]

    if cols_to_check_existing:

SyntaxError: incomplete input (4265539191.py, line 193)