# In [1]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.





# In [2]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

def calculate_completeness(df):
    """Return the percentage of non-missing cells across the whole DataFrame.

    Args:
        df: The pandas DataFrame to score.

    Returns:
        The share of non-null cells on a 0-100 scale. Returns 0 when the
        frame has no cells at all (no rows or no columns), which matches the
        empty-input result of the other guarded metric helpers in this file.
    """
    total_cells = df.size
    if total_cells == 0:
        # Nothing to measure; avoid dividing by zero on an empty frame.
        return 0
    non_missing_cells = df.count().sum()
    return (non_missing_cells / total_cells) * 100

def calculate_column_completeness(df, column):
    """Return the percentage of non-missing values in a single column.

    Args:
        df: The pandas DataFrame containing the column.
        column: Label of the column to score.

    Returns:
        The share of non-null values in ``column`` on a 0-100 scale, or 0
        when the frame has no rows.
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows to measure; avoid dividing by zero.
        return 0
    non_missing_count = df[column].count()
    return (non_missing_count / total_rows) * 100

def calculate_uniqueness(df, columns):
    """Return the percentage of rows that are distinct on the given columns.

    Args:
        df: The pandas DataFrame to score.
        columns: Column label or list of labels that together identify a
            record; passed to ``DataFrame.drop_duplicates`` as ``subset``.

    Returns:
        The number of rows left after dropping duplicates, divided by the
        total row count, on a 0-100 scale. Returns 0 when the frame has no
        rows.
    """
    total_rows = len(df)
    if total_rows == 0:
        # An empty frame would otherwise raise ZeroDivisionError.
        return 0
    unique_rows = df.drop_duplicates(subset=columns).shape[0]
    return (unique_rows / total_rows) * 100

def calculate_consistency_format(df, column, format_check_function):
    """Return the percentage of values in a column that pass a format check.

    Args:
        df: The pandas DataFrame containing the column.
        column: Label of the column to check.
        format_check_function: Callable applied to each value of the column;
            its results are summed, so it should return a truthy/falsy
            (ideally ``bool``) verdict per value.

    Returns:
        The share of rows whose value passes the check, on a 0-100 scale.
        Returns 0 when the frame has no rows.
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows to check; avoid dividing by zero.
        return 0
    valid_count = df[column].apply(format_check_function).sum()
    return (valid_count / total_rows) * 100

def is_valid_email(email):
    """Return True if ``email`` is a string shaped like ``local@domain.tld``.

    This is a basic shape check only: exactly one ``@``, with at least one
    character before it and a dot-separated domain after it. Non-string
    values (including NaN) are reported as invalid.

    Args:
        email: The value to check.

    Returns:
        ``True`` when the whole string matches the pattern, else ``False``.
    """
    if not isinstance(email, str):
        return False
    # fullmatch, not match: the pattern must consume the entire string, so
    # trailing junk such as a second "@..." segment is rejected.
    return bool(re.fullmatch(r"[^@]+@[^@]+\.[^@]+", email))

def is_valid_date_format(date_str, format_str):
    """Report whether ``date_str`` parses under the strptime pattern ``format_str``.

    Anything that is not a ``str`` (for example NaN or None) is treated as
    not matching.
    """
    if not isinstance(date_str, str):
        return False
    try:
        datetime.strptime(date_str, format_str)
    except ValueError:
        # strptime signals a mismatch between the text and the pattern this way.
        return False
    else:
        return True

def calculate_accuracy(df_actual, df_expected, on_column):
    """Return the percentage of actual rows whose key exists in the expected data.

    Note that only the key column(s) are compared; other columns are not
    checked for equality.

    Args:
        df_actual: The DataFrame being assessed.
        df_expected: The reference DataFrame.
        on_column: Column label or list of labels to join on.

    Returns:
        Matched rows of ``df_actual`` divided by its total row count, on a
        0-100 scale. Returns 0 when ``df_actual`` has no rows.
    """
    # Collapse repeated keys in the reference first. Otherwise an inner join
    # emits one row per (actual, expected) pair and a key that appears twice
    # in df_expected is counted twice, pushing the score above 100%.
    expected_unique = df_expected.drop_duplicates(subset=on_column)
    merged_df = pd.merge(
        df_actual,
        expected_unique,
        on=on_column,
        how='inner',
        suffixes=('_actual', '_expected'),
    )
    matching_rows = len(merged_df)
    total_rows_actual = len(df_actual)
    return (matching_rows / total_rows_actual) * 100 if total_rows_actual > 0 else 0

def calculate_timeliness(df, timestamp_column):
    """Score how recent the newest timestamp in a column is.

    A basic example; more complex logic might involve expected update
    frequencies.

    Args:
        df: The pandas DataFrame to score.
        timestamp_column: Label of a datetime64 column (naive or tz-aware).

    Returns:
        100 if the newest timestamp is under 1 day old, 90 if under 7 days,
        75 if under 30 days, otherwise 50. Returns 0 when the column is
        missing, is not a datetime dtype, or holds no non-null timestamps.
    """
    if timestamp_column not in df.columns:
        return 0
    timestamps = df[timestamp_column]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        return 0

    latest_date = timestamps.max()
    if pd.isna(latest_date):
        # Empty or all-NaT column: there is no timestamp to judge.
        return 0

    # Take "now" in the column's own timezone (None for naive data) so the
    # subtraction never mixes naive and tz-aware timestamps.
    now = pd.Timestamp.now(tz=latest_date.tz)
    age_days = (now - latest_date).days

    # Arbitrary scoring: Higher score for more recent data
    if age_days < 1:
        return 100
    if age_days < 7:
        return 90
    if age_days < 30:
        return 75
    return 50

def calculate_integrity_referential(df_parent, df_child, parent_key, child_key):
    """Score referential integrity between a child table and its parent.

    The result is the percentage of child rows whose ``child_key`` value
    appears among the parent's ``parent_key`` values; an empty child table
    scores 0.
    """
    foreign_keys = df_child[child_key]
    parent_keys = df_parent[parent_key]
    matched = foreign_keys.isin(parent_keys).sum()

    child_count = len(df_child)
    if child_count > 0:
        return (matched / child_count) * 100
    return 0

def calculate_validity_range(df, column, min_val, max_val):
    """Score the share of rows whose ``column`` value lies in [min_val, max_val].

    Both bounds are inclusive. The result is on a 0-100 scale; a frame with
    no rows scores 0.
    """
    values = df[column]
    at_least_min = values >= min_val
    at_most_max = values <= max_val
    in_range_rows = int((at_least_min & at_most_max).sum())

    row_count = len(df)
    if row_count > 0:
        return (in_range_rows / row_count) * 100
    return 0

def calculate_precision(df, numeric_column, expected_decimal_places):
    """Score the share of values that use at most the expected decimal places.

    Each value is converted to its string form and the digits after the
    decimal point are counted. Scientific notation (e.g. ``1e-07``, which is
    how very small floats are rendered) is interpreted numerically rather
    than by looking for a literal ``.``, so such values are no longer
    silently accepted. Missing values (NaN) and infinities count as having
    zero decimal places.

    Args:
        df: The pandas DataFrame containing the column.
        numeric_column: Label of the column to check.
        expected_decimal_places: Maximum number of decimal places allowed.

    Returns:
        The percentage (0-100) of rows within the allowed precision, or 0
        when the frame has no rows.
    """
    # Imported locally so this function does not depend on file-level imports.
    from decimal import Decimal, InvalidOperation

    def _decimal_places(text):
        """Count the digits after the decimal point in a value's string form."""
        try:
            exponent = Decimal(text).as_tuple().exponent
        except InvalidOperation:
            # Not parseable as a number: count characters after the last '.'.
            return len(text.split('.')[-1]) if '.' in text else 0
        if not isinstance(exponent, int):
            # NaN and infinity report a non-integer exponent marker.
            return 0
        # A negative exponent is the number of fractional digits.
        return max(0, -exponent)

    places = df[numeric_column].astype(str).apply(_decimal_places)
    valid_count = (places <= expected_decimal_places).sum()
    total_rows = len(df)
    return (valid_count / total_rows) * 100 if total_rows > 0 else 0

def calculate_accessibility(data_format_supported, metadata_available, data_portal_exists):
    """Return a simplified 0-100 accessibility score from three yes/no criteria.

    A supported data format contributes 50 points, available metadata 30,
    and an existing data portal 20.
    """
    criteria = (
        (data_format_supported, 50),
        (metadata_available, 30),
        (data_portal_exists, 20),
    )
    return sum(points for satisfied, points in criteria if satisfied)

def assign_overall_score(scores, weights=None):
    """Combine individual metric scores into one weighted-average score.

    When ``weights`` is omitted every score counts equally. Raises
    ``ValueError`` if the two sequences differ in length, and yields 0 when
    the weights do not sum to a positive number.
    """
    effective_weights = [1] * len(scores) if weights is None else weights
    if len(effective_weights) != len(scores):
        raise ValueError("Number of scores and weights must be the same.")

    numerator = 0
    for value, factor in zip(scores, effective_weights):
        numerator += value * factor

    denominator = sum(effective_weights)
    if denominator > 0:
        return numerator / denominator
    return 0

# --- Task 1: Customer Dataset ---
print("\n--- Task 1: Customer Dataset ---")

# Sample records seeded with missing values, a repeated customer, and
# off-format email/date entries so that every metric has something to find.
customer_data = {
    'CustomerID': [1, 2, 3, 4, 5, 6],
    'Name': ['Alice', np.nan, 'Charlie', 'David', 'Eve', 'Alice'],
    'Email': ['alice@example.com', np.nan, 'charlie@example.com', 'david@test', 'eve@sample.co.uk', 'alice@example.com'],
    'Phone': ['123-456-7890', '9876543210', '(555) 123-4567', '111-222-3333', np.nan, '123-456-7890'],
    'RegistrationDate': ['2023-01-15', '2023-02-20', '2023-01-15', '2023/03/01', '2023-02-20', '2023-01-15'],
}
customer_df = pd.DataFrame(customer_data)

# Individual metric scores, each on a 0-100 scale.
completeness_score_customer = calculate_completeness(customer_df)
uniqueness_score_customer = calculate_uniqueness(customer_df, ['Name', 'Email'])
consistency_email_customer = calculate_consistency_format(customer_df, 'Email', is_valid_email)
consistency_date_customer = calculate_consistency_format(
    customer_df,
    'RegistrationDate',
    lambda x: is_valid_date_format(x, '%Y-%m-%d'),
)

# Weighted combination; the weights line up positionally with the scores.
customer_scores = [
    completeness_score_customer,
    uniqueness_score_customer,
    consistency_email_customer,
    consistency_date_customer,
]
customer_weights = [0.3, 0.3, 0.2, 0.2]  # Example weights
overall_score_customer = assign_overall_score(customer_scores, customer_weights)

# Report, one line per metric, followed by the improvement-section header.
for report_line in (
    f"Completeness Score: {completeness_score_customer:.2f}%",
    f"Uniqueness Score (Name, Email): {uniqueness_score_customer:.2f}%",
    f"Email Consistency Score: {consistency_email_customer:.2f}%",
    f"Registration Date Consistency Score (YYYY-MM-DD): {consistency_date_customer:.2f}%",
    f"Overall Data Quality Score: {overall_score_customer:.2f}%",
    "\nAreas for Improvement:",
):
    print(report_line)
if completeness_score_customer

# Notebook output: SyntaxError: expected ':' (3886258478.py, line 133)