## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset, `students.csv`, that contains the following
columns: `ID`, `Name`, `Age`, `Grade`, `Email`.

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [6]:
# A conceptual data quality framework, implemented as runnable Python:
import pandas as pd
from datetime import datetime

# Step 1: Define the framework class that wraps the dataset
class DataQualityFramework:
    """Compute simple data quality metrics for a pandas DataFrame.

    Every ``calculate_*`` method returns its metric and also stores it in
    ``self.metrics`` under a fixed key ('Completeness', 'Consistency',
    'Accuracy', 'Timeliness', 'Uniqueness', 'Integrity'). Calling a method
    again overwrites the previously stored value for that key.

    The DataFrame is stored by reference (not copied) and is never modified
    by any of the metric methods.

    Percentage metrics are NaN when the frame has no rows (or no cells),
    because a percentage of nothing is undefined.
    """

    def __init__(self, df):
        """Store the DataFrame to analyse and start with no metrics."""
        self.df = df
        self.metrics = {}

    # ------------------------------------------------------------------
    # Private helpers shared by the metric methods
    # ------------------------------------------------------------------
    def _require_column(self, column):
        """Raise ValueError if *column* is not a column of the DataFrame."""
        if column not in self.df.columns:
            raise ValueError(f"Column '{column}' not found.")

    @staticmethod
    def _percent(part, whole):
        """Return ``part / whole * 100``, or NaN when *whole* is zero."""
        if whole == 0:
            # Avoids ZeroDivisionError / NumPy warnings on empty frames.
            return float("nan")
        return (part / whole) * 100

    def _share_in_range(self, column, valid_range):
        """Return the percentage of rows of *column* inside *valid_range*.

        Bounds are inclusive. Missing values never fall inside the range,
        so they lower the result. Without a range, 100 is returned.
        """
        self._require_column(column)
        if not valid_range:
            # Placeholder: without a range there is nothing to check against.
            return 100
        lower, upper = valid_range[0], valid_range[1]
        in_range = self.df[column].between(lower, upper).sum()
        return self._percent(in_range, len(self.df))

    # Step 2: Calculate Completeness
    def calculate_completeness(self):
        """Return the percentage of non-null cells in the whole DataFrame."""
        missing_cells = self.df.isnull().sum().sum()
        completeness = 100 - self._percent(missing_cells, self.df.size)
        self.metrics['Completeness'] = completeness
        return completeness

    # Step 3: Calculate Consistency
    def calculate_consistency(self, column, condition=None):
        """Return the percentage of rows in *column* that satisfy *condition*.

        Args:
            column: Name of the column to check.
            condition: Callable applied to each value; it should return a
                boolean. When omitted, a value is valid if it is a string
                containing "@" (a minimal e-mail check).

        Raises:
            ValueError: If *column* is not in the DataFrame.
        """
        self._require_column(column)
        if condition:
            check = condition
        else:
            def check(value):
                return isinstance(value, str) and "@" in value
        valid_entries = self.df[column].apply(check).sum()
        consistency = self._percent(valid_entries, len(self.df))
        self.metrics['Consistency'] = consistency
        return consistency

    # Step 4: Calculate Accuracy (Simple placeholder, assumes data is correct)
    def calculate_accuracy(self, column, valid_range=None):
        """Return the percentage of rows of *column* within *valid_range*.

        This is a stand-in for a real accuracy check, which would compare
        the data against a trusted source.

        Args:
            column: Name of the column to check.
            valid_range: Optional ``(low, high)`` pair, inclusive on both
                ends. When omitted, all data is assumed accurate (100).

        Raises:
            ValueError: If *column* is not in the DataFrame.
        """
        accuracy = self._share_in_range(column, valid_range)
        self.metrics['Accuracy'] = accuracy
        return accuracy

    # Step 5: Calculate Timeliness
    def calculate_timeliness(self, date_column):
        """Return the number of whole days since the latest date in the column.

        Values are parsed with ``pd.to_datetime(..., errors='coerce')`` on a
        temporary copy, so unparseable entries are ignored and the stored
        DataFrame keeps its original values and dtype. If no value can be
        parsed, the result is NaN.

        Raises:
            ValueError: If *date_column* is not in the DataFrame.
        """
        self._require_column(date_column)
        # Parse into a local Series; assigning back would change the caller's
        # data and make later metrics depend on the order of the calls.
        parsed_dates = pd.to_datetime(self.df[date_column], errors='coerce')
        latest = parsed_dates.max()
        if pd.isna(latest):
            timeliness = float("nan")
        else:
            # Take "now" in the column's own timezone (None for naive data)
            # so naive and timezone-aware timestamps are never mixed.
            now = pd.Timestamp.now(tz=latest.tz)
            timeliness = (now - latest).days
        self.metrics['Timeliness'] = timeliness
        return timeliness

    # Step 6: Calculate Uniqueness
    def calculate_uniqueness(self, column):
        """Return distinct non-null values as a percentage of the row count.

        Missing values are not counted as distinct values but still count
        as rows, so they lower the result.

        Raises:
            ValueError: If *column* is not in the DataFrame.
        """
        self._require_column(column)
        distinct_values = self.df[column].nunique()
        uniqueness = self._percent(distinct_values, len(self.df))
        self.metrics['Uniqueness'] = uniqueness
        return uniqueness

    # Step 7: Calculate Integrity
    def calculate_integrity(self, column, valid_range=None):
        """Return the percentage of rows of *column* within *valid_range*.

        Args:
            column: Name of the column to check.
            valid_range: Optional ``(low, high)`` pair, inclusive on both
                ends. When omitted, integrity is assumed perfect (100).

        Raises:
            ValueError: If *column* is not in the DataFrame.
        """
        integrity = self._share_in_range(column, valid_range)
        self.metrics['Integrity'] = integrity
        return integrity

    # Step 8: Summarize all metrics
    def summarize_metrics(self):
        """Return the dictionary of all metrics calculated so far."""
        return self.metrics

# Sample usage
# Five sample orders; the third record has no e-mail address.
data = {
    'customer_id': [1, 2, 3, 4, 5],
    'email': [
        'john@example.com',
        'jane@example.com',
        None,
        'bob@example.com',
        'charlie@example.com',
    ],
    'order_date': [
        '2023-01-01',
        '2023-03-01',
        '2022-07-15',
        '2023-04-10',
        '2023-02-20',
    ],
    'price': [100, 200, 150, 300, 250],
}
df = pd.DataFrame(data)

# Wrap the sample frame in the framework
dqf = DataQualityFramework(df)


def _has_at_sign(value):
    """Minimal e-mail check: the value is a string containing '@'."""
    return isinstance(value, str) and "@" in value


# Run every metric; each call also records its result inside dqf.metrics
completeness = dqf.calculate_completeness()
consistency = dqf.calculate_consistency('email', condition=_has_at_sign)
accuracy = dqf.calculate_accuracy('price', valid_range=(100, 500))
timeliness = dqf.calculate_timeliness('order_date')
uniqueness = dqf.calculate_uniqueness('customer_id')
integrity = dqf.calculate_integrity('price', valid_range=(50, 500))

# Show everything that was collected
metrics_summary = dqf.summarize_metrics()
print(metrics_summary)

{'Completeness': np.float64(95.0), 'Consistency': np.float64(80.0), 'Accuracy': np.float64(100.0), 'Timeliness': 772, 'Uniqueness': 100.0, 'Integrity': np.float64(100.0)}
