In [None]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.





In [1]:
import pandas as pd
import numpy as np

print("--- Data Quality Metrics & Scoring Examples ---")

# --- Task 1: Customer Dataset ---
print("\n--- Task 1: Customer Dataset Quality Scoring ---")

customer_data = pd.DataFrame({
    'CustomerID': [1, 2, 3, 4, 5, 1, 7],
    'Name': ['Alice', 'Bob', None, 'David', 'Eve', 'Alice', 'Frank'],
    'Email': ['alice@example.com', 'bob@sample.org', 'charlie@test.net', 'david@work.net', None, 'alice@example.com', 'frank@home.com'],
    'Phone': ['123-456-7890', '987-654-3210', '555-123-4567', None, '111-222-3333', '123-456-7890', '444-555-6666'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Seattle', 'New York', 'Austin']
})

def score_completeness(df, column):
    total = len(df)
    not_null = df[column].count()
    return (not_null / total) * 100

def score_uniqueness(df, column):
    total = len(df)
    unique = df[column].nunique()
    return (unique / total) * 100

def score_consistency(df, column):
    # Simple example for consistency in categorical data (more complex checks needed for other types)
    if df[column].dtype == 'object':
        value_counts = df[column].value_counts(dropna=False)
        most_frequent_count = value_counts.max()
        total = len(df)
        return (most_frequent_count / total) * 100
    return 100 # Assume consistent if not object for this simple example

completeness_name = score_completeness(customer_data, 'Name')
completeness_email = score_completeness(customer_data, 'Email')
completeness_phone = score_completeness(customer_data, 'Phone')

uniqueness_customer_id = score_uniqueness(customer_data, 'CustomerID')
uniqueness_email = score_uniqueness(customer_data, 'Email')
uniqueness_phone = score_uniqueness(customer_data, 'Phone')

consistency_city = score_consistency(customer_data, 'City')

overall_quality_task1 = np.mean([
    completeness_name, completeness_email, completeness_phone,
    uniqueness_customer_id, uniqueness_email, uniqueness_phone,
    consistency_city
])

print(f"Completeness (Name): {completeness_name:.2f}%")
print(f"Completeness (Email): {completeness_email:.2f}%")
print(f"Completeness (Phone): {completeness_phone:.2f}%")
print(f"Uniqueness (CustomerID): {uniqueness_customer_id:.2f}%")
print(f"Uniqueness (Email): {uniqueness_email:.2f}%")
print(f"Uniqueness (Phone): {uniqueness_phone:.2f}%")
print(f"Consistency (City): {consistency_city:.2f}%")
print(f"\nOverall Data Quality Score (Task 1): {overall_quality_task1:.2f}%")
print("\nAreas for Improvement (Task 1):")
if completeness_name < 100:
    print("- Missing values in 'Name' column.")
if completeness_email < 100:
    print("- Missing values in 'Email' column.")
if completeness_phone < 100:
    print("- Missing values in 'Phone' column.")
if uniqueness_customer_id < 100:
    print("- Duplicate values in 'CustomerID' column.")
if uniqueness_email < 100:
    print("- Duplicate values in 'Email' column.")
if uniqueness_phone < 100:
    print("- Duplicate values in 'Phone' column.")
if consistency_city < 100:
    print("- Inconsistencies in 'City' values (if applicable with more varied data).")

# --- Task 2: Online Shop Dataset ---
print("\n--- Task 2: Online Shop Dataset Quality Scoring ---")

shop_data = pd.DataFrame({
    'OrderID': [1, 2, 3, 4, 5],
    'ProductID': ['A101', 'B202', 'C303', 'A101', 'D404'],
    'ProductName': ['Laptop', 'Mouse', 'Keyboard', 'Laptop', 'Monitor'],
    'OrderDate': ['2023-01-15', '2023-01-20', '2023-02-10', '2023-01-15', '2023-03-01'],
    'DeliveryDate': ['2023-01-18', '2023-01-22', '2023-02-15', '2023-01-18', '2023-03-05'],
    'Price': [1200.00, 25.00, 75.00, 1200, 300.00], # Inconsistent data type for Price in one row
    'StockStatus': ['In Stock', 'Out of Stock', 'In Stock', 'In Stock', 'Low Stock']
})

def score_accuracy(df, column, expected_values=None):
    # Simple example for categorical accuracy
    if expected_values:
        valid_count = df[df[column].isin(expected_values)].count()[column]
        total = len(df)
        return (valid_count / total) * 100
    return 100 # Need specific validation rules for numeric/date

def score_timeliness(df, order_date_col, delivery_date_col):
    df_copy = df.dropna(subset=[order_date_col, delivery_date_col]).copy()
    try:
        df_copy['OrderDate'] = pd.to_datetime(df_copy[order_date_col])
        df_copy['DeliveryDate'] = pd.to_datetime(df_copy[delivery_date_col])
        on_time_deliveries = len(df_copy[df_copy['DeliveryDate'] >= df_copy['OrderDate']])
        total_deliveries = len(df_copy)
        return (on_time_deliveries / total_deliveries) * 100 if total_deliveries > 0 else 100
    except ValueError:
        return 0 # Error in date conversion

def score_integrity(df, unique_key_col):
    return score_uniqueness(df, unique_key_col)

accuracy_stock_status = score_accuracy(shop_data, 'StockStatus', expected_values=['In Stock', 'Out of Stock', 'Low Stock'])
timeliness_delivery = score_timeliness(shop_data, 'OrderDate', 'DeliveryDate')
integrity_order_id = score_integrity(shop_data, 'OrderID')

overall_quality_task2 = np.mean([accuracy_stock_status, timeliness_delivery, integrity_order_id])

print(f"Accuracy (Stock Status): {accuracy_stock_status:.2f}%")
print(f"Timeliness (Delivery): {timeliness_delivery:.2f}%")
print(f"Integrity (OrderID Uniqueness): {integrity_order_id:.2f}%")
print(f"\nOverall Data Quality Score (Task 2): {overall_quality_task2:.2f}%")
print("\nImprovement Suggestions (Task 2):")
if accuracy_stock_status < 100:
    print("- Validate 'StockStatus' values against a predefined list.")
if timeliness_delivery < 100:
    print("- Investigate orders with delivery dates before order dates (potential errors).")
if integrity_order_id < 100:
    print("- Ensure 'OrderID' values are unique.")
if shop_data['Price'].dtype != 'float64':
    print("- Ensure consistent data type for 'Price' column (e.g., numeric).")

# --- Task 3: Financial Dataset ---
print("\n--- Task 3: Financial Dataset Quality Scoring ---")

financial_data = pd.DataFrame({
    'TransactionID': [1001, 1002, 1003, 1004, 1005],
    'AccountID': ['ACC123', 'ACC456', 'INV789', 'ACC123', 'LOAN01'],
    'Amount': [100.50, 200.75, '300', 400.00, 500.25], # Non-numeric value
    'TransactionDate': ['2024-01-01', '2024-01-05', '2024-01-10', '2024-01-01', '2024-01-15'],
    'Currency': ['USD', 'EUR', 'USD', 'USD', 'GBP'],
    'PrecisionCheck': [1.000, 2.00, 3, 4.0000, 5.00] # Varying precision
})

def score_validity(df, column, valid_formats=None, valid_values=None):
    if valid_formats:
        is_valid = df[column].apply(lambda x: any(pd.to_datetime(x, format=fmt, errors='ignore') is not pd.NaT for fmt in valid_formats))
        return (is_valid.sum() / len(df)) * 100
    elif valid_values:
        valid_count = df[df[column].isin(valid_values)].count()[column]
        return (valid_count / len(df)) * 100
    return 100

def score_precision(df, column, decimal_places=2):
    if pd.api.types.is_numeric_dtype(df[column]):
        max_decimals = df[column].astype(str).str.split('.').str[-1].str.len().max()
        if pd.isna(max_decimals):
            return 100
        return (max(0, decimal_places - max_decimals + 1) / (max_decimals + 1)) * 100
    return 100

def score_accessibility(df):
    # Simple check: percentage of non-null values across all columns (proxy for accessibility)
    total_cells = df.size
    non_null_cells = df.count().sum()
    return (non_null_cells / total_cells) * 100

validity_currency = score_validity(financial_data, 'Currency', valid_values=['USD', 'EUR', 'GBP'])
validity_date = score_validity(financial_data, 'TransactionDate', valid_formats=['%Y-%m-%d'])
precision_amount = score_precision(financial_data, 'Amount', decimal_places=2)
precision_check = score_precision(financial_data, 'PrecisionCheck', decimal_places=2)
accessibility_data = score_accessibility(financial_data)

overall_quality_task3 = np.mean([validity_currency, validity_date, precision_amount, precision_check, accessibility_data])

print(f"Validity (Currency): {validity_currency:.2f}%")
print(f"Validity (TransactionDate): {validity_date:.2f}%")
print(f"Precision (Amount): {precision_amount:.2f}%")
print(f"Precision (PrecisionCheck): {precision_check:.2f}%")
print(f"Accessibility (Overall Completeness): {accessibility_data:.2f}%")
print(f"\nOverall Data Quality Score (Task 3): {overall_quality_task3:.2f}%")
print("\nProposed Corrective Actions (Task 3):")
if validity_currency < 100:
    print("- Standardize 'Currency' values to a defined set.")
if validity_date < 100:
    print("- Ensure all 'TransactionDate' values adhere to the 'YYYY-MM-DD' format.")
if precision_amount < 100:
    print("- Convert 'Amount' column to a numeric type and enforce consistent precision (e.g., 2 decimal places).")
if precision_check < 100:
    print("- Standardize the precision of values in the 'PrecisionCheck' column to 2 decimal places.")
if accessibility_data < 100:
    print("- Investigate and handle any missing values across the dataset.")

--- Data Quality Metrics & Scoring Examples ---

--- Task 1: Customer Dataset Quality Scoring ---
Completeness (Name): 85.71%
Completeness (Email): 85.71%
Completeness (Phone): 85.71%
Uniqueness (CustomerID): 85.71%
Uniqueness (Email): 71.43%
Uniqueness (Phone): 71.43%
Consistency (City): 42.86%

Overall Data Quality Score (Task 1): 75.51%

Areas for Improvement (Task 1):
- Missing values in 'Name' column.
- Missing values in 'Email' column.
- Missing values in 'Phone' column.
- Duplicate values in 'CustomerID' column.
- Duplicate values in 'Email' column.
- Duplicate values in 'Phone' column.
- Inconsistencies in 'City' values (if applicable with more varied data).

--- Task 2: Online Shop Dataset Quality Scoring ---
Accuracy (Stock Status): 100.00%
Timeliness (Delivery): 100.00%
Integrity (OrderID Uniqueness): 100.00%

Overall Data Quality Score (Task 2): 100.00%

Improvement Suggestions (Task 2):

--- Task 3: Financial Dataset Quality Scoring ---
Validity (Currency): 100.00%
Validit

  is_valid = df[column].apply(lambda x: any(pd.to_datetime(x, format=fmt, errors='ignore') is not pd.NaT for fmt in valid_formats))
