In [None]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [None]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).











In [5]:
import pandas as pd
import numpy as np
import re
from dateutil import parser
import json

def convert_numpy_types(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: convert_numpy_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(elem) for elem in obj]
    return obj

def standardize_validate_data(df, date_columns=None, numeric_constraints=None, string_format_checks=None, phone_number_columns=None, text_case_columns=None):
    """
    Standardizes data formats, enforces constraints, and addresses inconsistent representations
    in a Pandas DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        date_columns (list, optional): List of column names containing dates. Defaults to None.
        numeric_constraints (dict, optional): Dictionary of column names and their
                                              constraints (e.g., {'age': '> 0'}). Defaults to None.
        string_format_checks (dict, optional): Dictionary of column names and their
                                               format checks (e.g., {'email': 'email'}). Defaults to None.
        phone_number_columns (list, optional): List of column names containing phone numbers. Defaults to None.
        text_case_columns (list, optional): List of column names containing text to standardize case. Defaults to None.

    Returns:
        pd.DataFrame: The processed DataFrame with standardized formats and enforced consistency.
        dict: A dictionary containing validation and standardization reports for each task.
    """
    df_processed = df.copy()
    reports = {}

    # --- Task A: Enforcing Data Formats & Constraints ---

    # 13. Date Format Standardization
    reports['date_format_standardization'] = {}
    if date_columns:
        for col in date_columns:
            if col in df_processed.columns:
                standardized_dates = []
                errors = []
                for index, value in df_processed[col].items():
                    if pd.notna(value):
                        try:
                            parsed_date = parser.parse(str(value))
                            standardized_dates.append(parsed_date.strftime('%Y-%m-%d'))
                        except (ValueError, TypeError):
                            standardized_dates.append(np.nan)
                            errors.append((index, value))
                    else:
                        standardized_dates.append(np.nan)
                df_processed[col] = standardized_dates
                reports['date_format_standardization'][col] = {
                    'total_processed': len(df_processed[col]),
                    'total_standardized': pd.notna(df_processed[col]).sum(),
                    'total_errors': len(errors),
                    'error_samples': errors[:5]
                }
            else:
                reports['date_format_standardization'][col] = {'error': f"Column '{col}' not found."}

    # 14. Numeric Constraints Enforcement
    reports['numeric_constraints_enforcement'] = {}
    if numeric_constraints:
        for col, constraint in numeric_constraints.items():
            if col in df_processed.columns and pd.api.types.is_numeric_dtype(df_processed[col]):
                errors = []
                mask = pd.Series([True] * len(df_processed))
                try:
                    if '>' in constraint:
                        value = float(constraint.split('>')[1].strip())
                        mask = df_processed[col] > value
                    elif '>=' in constraint:
                        value = float(constraint.split('>=')[1].strip())
                        mask = df_processed[col] >= value
                    elif '<' in constraint:
                        value = float(constraint.split('<')[1].strip())
                        mask = df_processed[col] < value
                    elif '<=' in constraint:
                        value = float(constraint.split('<=')[1].strip())
                        mask = df_processed[col] <= value
                    elif '==' in constraint:
                        value = float(constraint.split('==')[1].strip())
                        mask = df_processed[col] == value
                    elif '!=' in constraint:
                        value = float(constraint.split('!=')[1].strip())
                        mask = df_processed[col] != value
                    else:
                        reports['numeric_constraints_enforcement'][col] = {'error': f"Invalid constraint format: '{constraint}'"}

                    invalid_indices = df_processed[~mask].index.tolist()
                    if not invalid_indices:
                        reports['numeric_constraints_enforcement'][col] = {'status': 'All constraints met'}
                    else:
                        violation_samples = df_processed.loc[invalid_indices[:5], col].to_dict()
                        reports['numeric_constraints_enforcement'][col] = {
                            'total_processed': len(df_processed[col]),
                            'total_violations': len(invalid_indices),
                            'violation_samples': convert_numpy_types(violation_samples),
                            'constraint': constraint
                        }
                        # For enforcement, you might want to set invalid values to NaN or handle them differently
                        # df_processed.loc[~mask, col] = np.nan
                except ValueError:
                    reports['numeric_constraints_enforcement'][col] = {'error': f"Invalid constraint value in: '{constraint}'"}
            else:
                reports['numeric_constraints_enforcement'][col] = {'error': f"Column '{col}' not found or is not numeric."}

    # 15. String Format Checks
    reports['string_format_checks'] = {}
    if string_format_checks:
        for col, format_type in string_format_checks.items():
            if col in df_processed.columns and pd.api.types.is_string_dtype(df_processed[col]):
                errors = []
                if format_type == 'email':
                    email_regex = r"[^@]+@[^@]+\.[^@]+"
                    invalid_indices = df_processed[df_processed[col].astype(str).str.match(email_regex) == False].index.tolist()
                    if not invalid_indices:
                        reports['string_format_checks'][col] = {'status': 'All formats met'}
                    else:
                        reports['string_format_checks'][col] = {
                            'total_processed': len(df_processed[col]),
                            'total_violations': len(invalid_indices),
                            'violation_samples': df_processed.loc[invalid_indices[:5], col].to_dict(),
                            'format_type': format_type
                        }
                        # For enforcement, you might want to set invalid values to NaN or handle them
                        # df_processed.loc[invalid_indices, col] = np.nan
                else:
                    reports['string_format_checks'][col] = {'error': f"Unsupported format type: '{format_type}'"}
            else:
                reports['string_format_checks'][col] = {'error': f"Column '{col}' not found or is not string type."}

    # --- Task B: Addressing Inconsistent Representations ---

    # 16. Standardizing Date Formats (Iterative - already done in Task A, but we can refine)
    reports['date_format_standardization_iterative'] = {}
    if date_columns:
        for col in date_columns:
            if col in df_processed.columns:
                standardized_dates = []
                corrections = 0
                error_samples_iterative = []
                original_formats = {}
                for index, value in df_processed[col].items():
                    if pd.notna(value):
                        if isinstance(value, str) and re.match(r'\d{4}-\d{2}-\d{2}', str(value)):
                            standardized_dates.append(value) # Already in the target format
                        else:
                            try:
                                parsed_date = parser.parse(str(value))
                                standardized_date = parsed_date.strftime('%Y-%m-%d')
                                if value != standardized_date:
                                    corrections += 1
                                    original_formats.setdefault(value, 0)
                                    original_formats[value] += 1
                                standardized_dates.append(standardized_date)
                            except (ValueError, TypeError):
                                standardized_dates.append(np.nan)
                                error_samples_iterative.append((index, value))
                    else:
                        standardized_dates.append(np.nan)
                df_processed[col] = standardized_dates
                reports['date_format_standardization_iterative'][col] = {
                    'total_processed': len(df_processed[col]),
                    'total_corrections': corrections,
                    'original_format_counts': dict(sorted(original_formats.items(), key=lambda item: item[1], reverse=True)),
                    'total_remaining_errors': pd.isna(df_processed[col]).sum() - reports['date_format_standardization'][col]['total_errors'] if col in reports['date_format_standardization'] else pd.isna(df_processed[col]).sum(),
                    'remaining_error_samples': error_samples_iterative[:5]
                }
            else:
                reports['date_format_standardization_iterative'][col] = {'error': f"Column '{col}' not found."}

    # 17. Pattern Matching for Consistency (Phone Numbers)
    reports['phone_number_standardization'] = {}
    if phone_number_columns:
        for col in phone_number_columns:
            if col in df_processed.columns and pd.api.types.is_string_dtype(df_processed[col]):
                standardized_numbers = []
                corrections = 0
                original_formats = {}
                for index, value in df_processed[col].items():
                    if pd.notna(value):
                        cleaned_number = re.sub(r'\D+', '', str(value)) # Remove non-digits
                        if len(cleaned_number) == 10:
                            standardized = f"({cleaned_number[:3]}) {cleaned_number[3:6]}-{cleaned_number[6:]}"
                            if value != standardized:
                                corrections += 1
                                original_formats.setdefault(value, 0)
                                original_formats[value] += 1
                            standardized_numbers.append(standardized)
                        else:
                            standardized_numbers.append(np.nan) # Or keep original, depending on requirement
                    else:
                        standardized_numbers.append(np.nan)
                df_processed[col] = standardized_numbers
                reports['phone_number_standardization'][col] = {
                    'total_processed': len(df_processed[col]),
                    'total_standardized': pd.notna(df_processed[col]).sum(),
                    'total_corrections': corrections,
                    'original_format_counts': dict(sorted(original_formats.items(), key=lambda item: item[1], reverse=True)),
                    'total_invalid': pd.isna(df_processed[col]).sum()
                }
            else:
                reports['phone_number_standardization'][col] = {'error': f"Column '{col}' not found or is not string type."}

    # 18. Handling Mixed Case Text
    reports['text_case_standardization'] = {}
    if text_case_columns:
        for col in text_case_columns:
            if col in df_processed.columns and pd.api.types.is_string_dtype(df_processed[col]):
                df_processed[col] = df_processed[col].astype(str).str.upper() # Or .str.lower() for lowercase
                reports['text_case_standardization'][col] = {
                    'total_processed': len(df_processed[col]),
                    'standardized_case': 'uppercase' # Or 'lowercase'
                }
            else:
                reports['text_case_standardization'][col] = {'error': f"Column '{col}' not found or is not string type."}

    return df_processed, convert_numpy_types(reports)

# Example Usage (Corrected data with equal list lengths):
data = {'date_col': ['2023-01-15', '02/20/2023', '2023-Mar-01', '15-01-2024', 'invalid date'],
        'age': [30, -5, 45, 0, 25],
        'email': ['test@example.com', 'INVALID_EMAIL', 'another@domain.net', np.nan, 'missing.at'],
        'phone': ['1234567890', '(123) 456-7890', '123-456-7890', '123.456.7890', 'invalid'],
        'text_entry': ['Mixed Case', 'all lower', 'ALL UPPER', np.nan, 'Some Words']}
df = pd.DataFrame(data)

date_columns_to_standardize = ['date_col']
numeric_constraints_to_enforce = {'age': '> 0'}
string_format_checks_to_perform = {'email': 'email'}
phone_number_columns_to_standardize = ['phone']
text_case_columns_to_uppercase = ['text_entry']

df_processed, reports = standardize_validate_data(
    df.copy(),
    date_columns=date_columns_to_standardize,
    numeric_constraints=numeric_constraints_to_enforce,
    string_format_checks=string_format_checks_to_perform,
    phone_number_columns=phone_number_columns_to_standardize,
    text_case_columns=text_case_columns_to_uppercase
)

print("Processed DataFrame:")
print(df_processed)
print("\nReports:")
import json
print(json.dumps(reports, indent=4))

Processed DataFrame:
     date_col  age               email           phone  text_entry
0  2023-01-15   30    test@example.com  (123) 456-7890  Mixed Case
1  2023-02-20   -5       INVALID_EMAIL  (123) 456-7890   all lower
2  2023-03-01   45  another@domain.net  (123) 456-7890   ALL UPPER
3  2024-01-15    0                 NaN  (123) 456-7890         NaN
4         NaN   25          missing.at             NaN  Some Words

Reports:
{
    "date_format_standardization": {
        "date_col": {
            "total_processed": 5,
            "total_standardized": 4,
            "total_errors": 1,
            "error_samples": [
                [
                    4,
                    "invalid date"
                ]
            ]
        }
    },
    "numeric_constraints_enforcement": {
        "age": {
            "total_processed": 5,
            "total_violations": 2,
            "violation_samples": {
                "1": -5,
                "3": 0
            },
            "constraint