## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework using ISO 8000 principles to assess key dimensions in a dataset.

In [1]:
# Write a conceptual framework described in Python pseudo-code:
# DataQualityFramework Class

class DataQualityFramework:
    def __init__(self, dataset):
        self.dataset = dataset
        self.quality_metrics = {}

    def assess_completeness(self, columns=None):
        """
        Measures the completeness of specified columns (or all columns).
        Completeness = (Number of non-missing values / Total number of values) * 100
        """
        print("\nAssessing Completeness...")
        completeness_scores = {}
        if columns is None:
            columns = self.dataset.columns
        for col in columns:
            total_values = len(self.dataset[col])
            non_missing_values = self.dataset[col].count()
            completeness = (non_missing_values / total_values) * 100 if total_values > 0 else 0
            completeness_scores[col] = completeness
            print(f"  Completeness of '{col}': {completeness:.2f}%")
        self.quality_metrics['completeness'] = completeness_scores

    def assess_accuracy(self, column, expected_format=None, valid_range=None, custom_rule=None):
        """
        Measures the accuracy of a specified column based on expected format, valid range, or a custom rule.
        Accuracy (simplified) = (Number of valid values / Total number of values) * 100
        """
        print(f"\nAssessing Accuracy for '{column}'...")
        valid_count = 0
        total_values = len(self.dataset[column])
        if total_values == 0:
            accuracy = 0
        else:
            for value in self.dataset[column]:
                is_valid = True
                if expected_format and not isinstance(value, expected_format):
                    is_valid = False
                if valid_range and not (valid_range[0] <= value <= valid_range[1]):
                    is_valid = False
                if custom_rule and not custom_rule(value):
                    is_valid = False
                if is_valid and pd.notna(value): # Consider non-missing and valid
                    valid_count += 1
            accuracy = (valid_count / total_values) * 100
        print(f"  Accuracy of '{column}': {accuracy:.2f}%")
        if 'accuracy' not in self.quality_metrics:
            self.quality_metrics['accuracy'] = {}
        self.quality_metrics['accuracy'][column] = accuracy

    def assess_consistency(self, column1, column2, consistency_rule):
        """
        Measures the consistency between two specified columns based on a given rule.
        Consistency (simplified) = (Number of consistent pairs / Total number of pairs) * 100
        """
        print(f"\nAssessing Consistency between '{column1}' and '{column2}'...")
        consistent_count = 0
        total_pairs = len(self.dataset)
        if total_pairs == 0:
            consistency = 0
        else:
            for index, row in self.dataset.iterrows():
                value1 = row[column1]
                value2 = row[column2]
                if pd.notna(value1) and pd.notna(value2) and consistency_rule(value1, value2):
                    consistent_count += 1
            consistency = (consistent_count / total_pairs) * 100
        print(f"  Consistency between '{column1}' and '{column2}': {consistency:.2f}%")
        if 'consistency' not in self.quality_metrics:
            self.quality_metrics['consistency'] = {}
        self.quality_metrics['consistency'][(column1, column2)] = consistency

    def assess_uniqueness(self, columns):
        """
        Measures the uniqueness of records based on specified columns.
        Uniqueness = (Number of unique records / Total number of records) * 100
        """
        print(f"\nAssessing Uniqueness based on columns: {columns}...")
        total_records = len(self.dataset)
        unique_records = self.dataset.duplicated(subset=columns, keep=False).sum() # Count all duplicates
        uniqueness = ((total_records - unique_records) / total_records) * 100 if total_records > 0 else 0
        print(f"  Uniqueness based on {columns}: {uniqueness:.2f}%")
        self.quality_metrics['uniqueness'] = uniqueness

    def get_quality_report(self):
        """
        Returns a summary of the assessed data quality metrics.
        """
        print("\n--- Data Quality Report ---")
        for metric, scores in self.quality_metrics.items():
            print(f"\n{metric.capitalize()}:")
            if isinstance(scores, dict):
                for item, score in scores.items():
                    print(f"  {item}: {score:.2f}%")
            else:
                print(f"  Overall: {scores:.2f}%")

# --- Example Usage ---
import pandas as pd
import numpy as np

# Sample Dataset
data = {'ID': [1, 2, 3, 4, 5, 1],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', np.nan, 'Alice'],
        'Age': [25, 30, 120, 40, 28, 25],
        'City1': ['Bangalore', 'Mumbai', 'Chennai', 'Bangalore', 'Delhi', 'Mumbai'],
        'City2': ['Bangalore', 'Bombay', 'Chennai', 'Bangalore', 'Delhi', 'Mumbai']}
df = pd.DataFrame(data)

# Initialize the Data Quality Framework
dq_framework = DataQualityFramework(df)

# Assess Completeness
dq_framework.assess_completeness()
dq_framework.assess_completeness(columns=['Name', 'Income']) # Assuming 'Income' was added

# Assess Accuracy
dq_framework.assess_accuracy('Age', valid_range=(0, 100))
dq_framework.assess_accuracy('Name', expected_format=str)

# Assess Consistency
def check_city_consistency(city1, city2):
    # Simple rule: ignore case and 'bay'/'bai' variations
    c1 = str(city1).lower().replace('bay', 'bai')
    c2 = str(city2).lower().replace('bay', 'bai')
    return c1 == c2
dq_framework.assess_consistency('City1', 'City2', check_city_consistency)

# Assess Uniqueness
dq_framework.assess_uniqueness(['ID'])
dq_framework.assess_uniqueness(['Name', 'Age'])

# Get the Quality Report
dq_framework.get_quality_report()


Assessing Completeness...
  Completeness of 'ID': 100.00%
  Completeness of 'Name': 83.33%
  Completeness of 'Age': 100.00%
  Completeness of 'City1': 100.00%
  Completeness of 'City2': 100.00%

Assessing Completeness...
  Completeness of 'Name': 83.33%


KeyError: 'Income'