**Task 1**: Checking Null Values for Completeness

**Description**: Check whether a dataset contains any null values, since nulls indicate incomplete data.

In [None]:
# Write your code from here

**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [None]:
# Write your code from here

**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check that identifier columns in a dataset (e.g., emails) contain no duplicate values.

In [None]:
# Write your code from here

Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [None]:
# Write your code from here

Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [None]:
# Write your code from here

Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [None]:
# Write your code from here

Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [None]:
# Write your code from here

Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [None]:
# Write your code from here

Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [None]:
# Write your code from here

Task 10: Advanced Regex for Data Validity Check

Description: Validate complex fields against advanced regex patterns that encode multi-part rules (e.g., a product code made of a fixed prefix, an identifier segment, and a version segment).

In [None]:
# Write your code from here

In [1]:
import pandas as pd
import re
import numpy as np # For numerical operations like mean imputation

# --- Configuration ---
# CSV inputs read by the tasks below. The paths are relative, so they are
# resolved against the current working directory when load_data() opens them.
CUSTOMER_DATA_PATH = 'customer_data.csv'  # read by Tasks 1-7 and 9
SALES_DATA_PATH = 'sales_data.csv'  # read by Task 8
PRODUCT_CATALOG_PATH = 'product_catalog.csv'  # read by Task 10

# --- Utility Function to Load Data ---
def load_data(file_path):
    """Load a CSV file into a DataFrame, falling back to an empty frame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame. When the file does not exist, or exists but
        contains no data to parse, an empty DataFrame is returned instead so
        that callers can guard on ``df.empty`` rather than handle exceptions.
    """
    # Keep the try body down to the single call that can fail.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure it's in the same directory.")
        return pd.DataFrame() # Return empty DataFrame on error
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row; treat it like a failed load
        # instead of letting the exception abort every remaining task.
        print(f"Error: {file_path} is empty. No columns to parse.")
        return pd.DataFrame()

    print(f"\nSuccessfully loaded {file_path}")
    return df

# --- Task 1: Checking Null Values for Completeness ---
print("--- Task 1: Checking Null Values for Completeness ---")
customer_df_t1 = load_data(CUSTOMER_DATA_PATH)
if customer_df_t1.empty:
    # Nothing to inspect: the loader produced an empty frame.
    print("Skipping Task 1 due to data loading error.")
else:
    # Tally the missing entries column by column.
    null_counts_t1 = customer_df_t1.isna().sum()
    print("Null values per column:")
    print(null_counts_t1)


# --- Task 2: Checking Data Type Validity ---
# Reports the loaded dtypes, compares them with the expected ones, and lists
# the individual 'age' / 'registration_date' values that fail conversion.
print("\n--- Task 2: Checking Data Type Validity ---")
customer_df_t2 = load_data(CUSTOMER_DATA_PATH)
if not customer_df_t2.empty:
    print("Current data types:")
    print(customer_df_t2.dtypes)

    # Expected data types
    expected_dtypes = {
        'customer_id': 'object', # or 'string' in newer pandas
        'name': 'object',
        'age': 'int64',
        'email': 'object',
        'city': 'object',
        'phone_number': 'object',
        'registration_date': 'object' # Will convert to datetime below
    }

    # Compare every expected column against what was actually loaded.
    dtype_issues = []
    for column_name, expected_dtype in expected_dtypes.items():
        if column_name not in customer_df_t2.columns:
            dtype_issues.append(f"'{column_name}': column is missing")
            continue
        actual_dtype = customer_df_t2[column_name].dtype
        if expected_dtype == 'object':
            # Text may load as plain 'object' or as a dedicated string dtype,
            # depending on the pandas version; accept either.
            dtype_ok = (
                pd.api.types.is_object_dtype(actual_dtype)
                or pd.api.types.is_string_dtype(actual_dtype)
            )
        else:
            dtype_ok = str(actual_dtype) == expected_dtype
        if not dtype_ok:
            dtype_issues.append(
                f"'{column_name}': expected {expected_dtype}, found {actual_dtype}"
            )

    if dtype_issues:
        print("\nColumns that do not match the expected data types:")
        for issue in dtype_issues:
            print(f"  - {issue}")
    else:
        print("\nAll columns match the expected data types.")

    # Attempt to convert 'age' to numeric, coercing errors
    if 'age' in customer_df_t2.columns:
        customer_df_t2['age_numeric'] = pd.to_numeric(customer_df_t2['age'], errors='coerce')
        # A value that was present but became NaN could not be parsed as a number.
        non_numeric_ages = customer_df_t2[customer_df_t2['age_numeric'].isnull() & customer_df_t2['age'].notnull()]
        if not non_numeric_ages.empty:
            age_report_columns = [c for c in ('customer_id', 'age') if c in non_numeric_ages.columns]
            print("\nRows with non-numeric 'age' values:")
            print(non_numeric_ages[age_report_columns])
        else:
            print("\nAll 'age' values are numeric or null.")
    else:
        print("\nSkipping 'age' conversion check: column not found.")

    # Attempt to convert 'registration_date' to datetime, coercing errors
    if 'registration_date' in customer_df_t2.columns:
        customer_df_t2['registration_date_parsed'] = pd.to_datetime(customer_df_t2['registration_date'], errors='coerce')
        # Same idea as for 'age': present in the source but NaT after parsing.
        invalid_dates = customer_df_t2[customer_df_t2['registration_date_parsed'].isnull() & customer_df_t2['registration_date'].notnull()]
        if not invalid_dates.empty:
            date_report_columns = [c for c in ('customer_id', 'registration_date') if c in invalid_dates.columns]
            print("\nRows with invalid 'registration_date' format:")
            print(invalid_dates[date_report_columns])
        else:
            print("\nAll 'registration_date' values are valid datetime formats or null.")
    else:
        print("\nSkipping 'registration_date' conversion check: column not found.")

    print("\nData types after attempting type conversions:")
    print(customer_df_t2.dtypes)

else:
    print("Skipping Task 2 due to data loading error.")


# --- Task 3: Verify Uniqueness of Identifiers ---
print("\n--- Task 3: Verify Uniqueness of Identifiers ---")
customer_df_t3 = load_data(CUSTOMER_DATA_PATH)
if customer_df_t3.empty:
    print("Skipping Task 3 due to data loading error.")
else:
    # 'customer_id': report uniqueness and, if violated, every repeated row.
    is_customer_id_unique = customer_df_t3['customer_id'].is_unique
    print(f"Is 'customer_id' unique? {is_customer_id_unique}")
    if not is_customer_id_unique:
        repeated_id_mask = customer_df_t3.duplicated(subset=['customer_id'], keep=False)
        duplicate_customer_ids = customer_df_t3[repeated_id_mask]
        print("Duplicate 'customer_id' entries:")
        print(duplicate_customer_ids[['customer_id', 'name', 'email']].sort_values('customer_id'))

    # 'email': nulls are left out, since a missing email is not a repeated value.
    present_emails = customer_df_t3['email'].dropna()
    unique_emails_count = present_emails.nunique()
    total_non_null_emails = present_emails.count()
    if unique_emails_count == total_non_null_emails:
        print("\n'email' is unique (excluding nulls).")
    else:
        print(f"\n'email' is NOT unique (found {total_non_null_emails - unique_emails_count} duplicate email values excluding nulls).")
        repeated_email_mask = customer_df_t3.duplicated(subset=['email'], keep=False)
        has_email_mask = customer_df_t3['email'].notnull()
        duplicate_emails = customer_df_t3[repeated_email_mask & has_email_mask]
        print("Duplicate 'email' entries:")
        print(duplicate_emails[['customer_id', 'name', 'email']].sort_values('email'))


# --- Task 4: Validate Email Format Using Regex (Revisited with dedicated task focus) ---
# Flags every row whose 'email' is missing or does not match the pattern below.
print("\n--- Task 4: Validate Email Format Using Regex ---")
customer_df_t4 = load_data(CUSTOMER_DATA_PATH)
if not customer_df_t4.empty and 'email' in customer_df_t4.columns:
    email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    # Compile once; the pattern is applied to every row.
    email_pattern = re.compile(email_regex)

    def validate_email(email):
        """Return True when *email* is present and matches ``email_regex``.

        Missing values count as invalid for this check. ``fullmatch`` is used
        because ``match`` with a trailing ``$`` would also accept a value that
        ends in a newline.
        """
        if pd.isna(email):
            return False
        return email_pattern.fullmatch(str(email)) is not None

    customer_df_t4['email_format_valid'] = customer_df_t4['email'].apply(validate_email)
    invalid_format_emails = customer_df_t4[~customer_df_t4['email_format_valid']]

    if not invalid_format_emails.empty:
        print("\nRecords with invalid email format:")
        print(invalid_format_emails[['customer_id', 'email']])
        print(f"Total records with invalid email format: {len(invalid_format_emails)}")
    else:
        print("\nAll emails adhere to the specified format.")
else:
    print("Skipping Task 4 due to data loading error or missing 'email' column.")

# --- Task 5: Check for Logical Age Validity ---
print("\n--- Task 5: Check for Logical Age Validity ---")
customer_df_t5 = load_data(CUSTOMER_DATA_PATH)
if customer_df_t5.empty or 'age' not in customer_df_t5.columns:
    print("Skipping Task 5 due to data loading error or missing 'age' column.")
else:
    min_age = 0
    max_age = 120

    # Coerce to numbers so the comparisons work; unparseable entries become NaN.
    customer_df_t5['age_numeric'] = pd.to_numeric(customer_df_t5['age'], errors='coerce')
    numeric_age = customer_df_t5['age_numeric']

    # A row is invalid when its age is below/above the bounds, or when it held
    # a value that could not be converted to a number.
    below_minimum = numeric_age < min_age
    above_maximum = numeric_age > max_age
    not_a_number = numeric_age.isna() & customer_df_t5['age'].notnull()
    invalid_age_range = customer_df_t5[below_minimum | above_maximum | not_a_number]

    if invalid_age_range.empty:
        print("\nAll age values are within the logical range.")
    else:
        print(f"\nRecords with age outside the logical range ({min_age}-{max_age}) or non-numeric:")
        print(invalid_age_range[['customer_id', 'name', 'age']])
        print(f"Total invalid age values found: {len(invalid_age_range)}")


# --- Task 6: Identify and Handle Missing Data (Impute with Mean) ---
# Shows the null counts, then fills 'age_numeric' with the rounded mean age and
# 'city' / 'phone_number' with placeholders on a copy of the loaded frame.
print("\n--- Task 6: Identify and Handle Missing Data (Impute with Mean) ---")
customer_df_t6 = load_data(CUSTOMER_DATA_PATH)
if not customer_df_t6.empty:
    print("Original null values:")
    print(customer_df_t6.isnull().sum())

    # Example: Impute missing 'age' with the mean age
    # First, ensure 'age' is numeric
    customer_df_t6['age_numeric'] = pd.to_numeric(customer_df_t6['age'], errors='coerce')
    mean_age = customer_df_t6['age_numeric'].mean()
    print(f"\nMean age for imputation: {mean_age:.2f}")

    customer_df_imputed = customer_df_t6.copy() # Work on a copy

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained form acts on a temporary object and
    # leaves the frame unchanged when pandas Copy-on-Write is active.
    if pd.notna(mean_age):
        customer_df_imputed['age_numeric'] = customer_df_imputed['age_numeric'].fillna(round(mean_age)) # Impute age
    else:
        # No numeric age at all: the mean is NaN and round() would raise.
        print("No numeric 'age' values available; skipping mean imputation for 'age'.")

    # Example: Impute missing 'city' with a placeholder
    customer_df_imputed['city'] = customer_df_imputed['city'].fillna('Unknown')

    # Example: Impute missing 'phone_number' with a placeholder
    customer_df_imputed['phone_number'] = customer_df_imputed['phone_number'].fillna('N/A')

    print("\nNull values after imputation:")
    print(customer_df_imputed.isnull().sum())
    print("\nSample of DataFrame after imputation (first 5 rows):")
    print(customer_df_imputed.head())
else:
    print("Skipping Task 6 due to data loading error.")


# --- Task 7: Detect Duplicates ---
print("\n--- Task 7: Detect Duplicates ---")
customer_df_t7 = load_data(CUSTOMER_DATA_PATH)
if customer_df_t7.empty:
    print("Skipping Task 7 due to data loading error.")
else:
    # Whole-row duplicates; keep=False flags every member of a duplicate group.
    exact_duplicate_mask = customer_df_t7.duplicated(keep=False)
    duplicate_rows = customer_df_t7[exact_duplicate_mask]

    if duplicate_rows.empty:
        print("No exact duplicate rows found.")
    else:
        every_column = list(customer_df_t7.columns)
        print("Detected duplicate rows (exact matches):")
        print(duplicate_rows.sort_values(by=every_column))
        print(f"Total duplicate rows found: {len(duplicate_rows)}")

    # Duplicates on a column subset: rows sharing both 'customer_id' and 'email'.
    # Task 3 already covers 'customer_id' alone; this shows the multi-column form.
    key_columns = ['customer_id', 'email']
    key_duplicate_mask = customer_df_t7.duplicated(subset=key_columns, keep=False)
    duplicate_id_email = customer_df_t7[key_duplicate_mask]
    if duplicate_id_email.empty:
        print("\nNo duplicate 'customer_id' and 'email' combinations found.")
    else:
        print("\nDetected rows with duplicate 'customer_id' and 'email' combination:")
        print(duplicate_id_email.sort_values(by=key_columns))


# --- Task 8: Validate Correctness of Numerical Values ---
# Checks 'sales_amount', 'quantity' and 'discount_rate' against their allowed
# ranges. Values that are present but not numeric are reported as violations
# too: coercion turns them into NaN, and NaN fails every comparison, so they
# would otherwise pass the range checks unnoticed. Nulls are not flagged.
print("\n--- Task 8: Validate Correctness of Numerical Values ---")
sales_df_t8 = load_data(SALES_DATA_PATH)
if not sales_df_t8.empty:
    # Validate 'sales_amount': must be positive
    sales_amount_numeric = pd.to_numeric(sales_df_t8['sales_amount'], errors='coerce')
    sales_amount_unparseable = sales_amount_numeric.isna() & sales_df_t8['sales_amount'].notna()
    invalid_sales_amount = sales_df_t8[(sales_amount_numeric <= 0) | sales_amount_unparseable]
    if not invalid_sales_amount.empty:
        print("\nRecords with non-positive or non-numeric 'sales_amount':")
        print(invalid_sales_amount[['transaction_id', 'sales_amount']])
    else:
        print("\nAll 'sales_amount' values are positive.")

    # Validate 'quantity': must be positive integer
    quantity_numeric = pd.to_numeric(sales_df_t8['quantity'], errors='coerce')
    quantity_unparseable = quantity_numeric.isna() & sales_df_t8['quantity'].notna()
    # Guard with notna(): NaN % 1 is NaN, and NaN != 0 evaluates to True.
    quantity_fractional = quantity_numeric.notna() & (quantity_numeric % 1 != 0)
    invalid_quantity = sales_df_t8[
        (quantity_numeric <= 0) | quantity_fractional | quantity_unparseable
    ]
    if not invalid_quantity.empty:
        print("\nRecords with non-positive, non-integer or non-numeric 'quantity':")
        print(invalid_quantity[['transaction_id', 'quantity']])
    else:
        print("\nAll 'quantity' values are positive integers.")

    # Validate 'discount_rate': must be between 0 and 1 (inclusive)
    discount_rate_numeric = pd.to_numeric(sales_df_t8['discount_rate'], errors='coerce')
    discount_rate_unparseable = discount_rate_numeric.isna() & sales_df_t8['discount_rate'].notna()
    invalid_discount_rate = sales_df_t8[
        (discount_rate_numeric < 0) |
        (discount_rate_numeric > 1) |
        discount_rate_unparseable
    ]
    if not invalid_discount_rate.empty:
        print("\nRecords with 'discount_rate' outside 0-1 range or non-numeric:")
        print(invalid_discount_rate[['transaction_id', 'discount_rate']])
    else:
        print("\nAll 'discount_rate' values are within the 0-1 range.")
else:
    print("Skipping Task 8 due to data loading error.")


# --- Task 9: Custom Completeness Rule Violation Report ---
print("\n--- Task 9: Custom Completeness Rule Violation Report ---")
customer_df_t9 = load_data(CUSTOMER_DATA_PATH)
if customer_df_t9.empty:
    print("Skipping Task 9 due to data loading error.")
else:
    # Fields that every profile must have filled in.
    mandatory_fields = ['name', 'email', 'customer_id']

    # Flag each row in which at least one mandatory field is null.
    mandatory_null_mask = customer_df_t9[mandatory_fields].isnull()
    customer_df_t9['violates_completeness_rule'] = mandatory_null_mask.any(axis=1)

    # Work on a copy of the offending rows so the report column stays local to it.
    violations_report = customer_df_t9[customer_df_t9['violates_completeness_rule']].copy()

    if violations_report.empty:
        print("\nNo profiles violate the mandatory field completeness rules.")
    else:
        def _missing_mandatory(row):
            """Return the mandatory field names whose value is null in *row*."""
            return [field for field in mandatory_fields if pd.isna(row[field])]

        print("\n--- Completeness Rule Violation Report ---")
        print("The following customer profiles violate mandatory field completeness rules:")
        # Record, per violating row, exactly which fields are absent.
        violations_report['missing_fields'] = violations_report.apply(_missing_mandatory, axis=1)
        print(violations_report[['customer_id', 'name', 'email', 'missing_fields']])
        print(f"\nTotal profiles violating completeness rules: {len(violations_report)}")


# --- Task 10: Advanced Regex for Data Validity Check ---
# Validates 'product_code' and 'version' against the patterns below and prints
# the rows that fail either rule. Missing values count as invalid.
print("\n--- Task 10: Advanced Regex for Data Validity Check ---")
product_df_t10 = load_data(PRODUCT_CATALOG_PATH)
if not product_df_t10.empty:
    # Rule 1: Validate 'product_code' format: e.g., 'PROD-XYZ-V1.0' or 'ITEM-123-BETA'
    # Pattern: (PROD|ITEM)-[A-Z0-9]{3,}-[A-Z0-9.]{3,}
    # This allows 'PROD' or 'ITEM' prefix, followed by a hyphen, then 3+ alphanumeric,
    # then a hyphen, then 3+ alphanumeric/dot characters.
    product_code_regex = r"^(PROD|ITEM)-[A-Z0-9]{3,}-[A-Z0-9.]{3,}$"
    product_code_pattern = re.compile(product_code_regex)

    def validate_product_code(code):
        """Return True when *code* is present and matches ``product_code_regex``.

        ``fullmatch`` is used because ``match`` with a trailing ``$`` would
        also accept a value that ends in a newline.
        """
        if pd.isna(code):
            return False
        return product_code_pattern.fullmatch(str(code)) is not None

    product_df_t10['product_code_valid'] = product_df_t10['product_code'].apply(validate_product_code)

    # Rule 2: Validate 'version' format: e.g., '1.0', '2.1', '0.9' (numeric.numeric)
    # Pattern: ^\d+\.\d+$ (one or more digits, dot, one or more digits)
    # NOTE(review): load_data reads the CSV without a dtype override, so pandas
    # may infer 'version' as a number and the str() below would then see the
    # numeric repr rather than the original text (e.g. '2.10' -> '2.1').
    # Confirm against the real catalog file.
    version_regex = r"^\d+\.\d+$"
    version_pattern = re.compile(version_regex)

    def validate_version(version):
        """Return True when *version* is present and matches ``version_regex``."""
        if pd.isna(version):
            return False
        return version_pattern.fullmatch(str(version)) is not None

    product_df_t10['version_valid'] = product_df_t10['version'].apply(validate_version)

    print("\nAdvanced Regex Validation Results:")
    print(product_df_t10[['product_code', 'product_code_valid', 'version', 'version_valid']])

    invalid_product_codes = product_df_t10[~product_df_t10['product_code_valid']]
    if not invalid_product_codes.empty:
        print("\nProducts with invalid 'product_code' format:")
        print(invalid_product_codes[['product_code', 'product_name']])
    else:
        print("\nAll product codes adhere to the specified format.")

    invalid_versions = product_df_t10[~product_df_t10['version_valid']]
    if not invalid_versions.empty:
        print("\nProducts with invalid 'version' format:")
        print(invalid_versions[['product_code', 'version']])
    else:
        print("\nAll versions adhere to the specified format.")

else:
    print("Skipping Task 10 due to data loading error.")

--- Task 1: Checking Null Values for Completeness ---
Error: customer_data.csv not found. Please ensure it's in the same directory.
Skipping Task 1 due to data loading error.

--- Task 2: Checking Data Type Validity ---
Error: customer_data.csv not found. Please ensure it's in the same directory.
Skipping Task 2 due to data loading error.

--- Task 3: Verify Uniqueness of Identifiers ---
Error: customer_data.csv not found. Please ensure it's in the same directory.
Skipping Task 3 due to data loading error.

--- Task 4: Validate Email Format Using Regex ---
Error: customer_data.csv not found. Please ensure it's in the same directory.
Skipping Task 4 due to data loading error or missing 'email' column.

--- Task 5: Check for Logical Age Validity ---
Error: customer_data.csv not found. Please ensure it's in the same directory.
Skipping Task 5 due to data loading error or missing 'age' column.

--- Task 6: Identify and Handle Missing Data (Impute with Mean) ---
Error: customer_data.csv not