## Format Compliance for Email

**Description**: Check if emails in a list are valid based on a basic regex pattern.

In [None]:
# Write your code from here

In [1]:
import pandas as pd
import re # Import the regular expression module

# --- Task 1: Format Compliance for Email ---
print("--- Task 1: Format Compliance for Email ---")

# Basic regex pattern for email validation.
# This pattern is simplified. Real-world email validation is more complex.
# It checks for:
#   ^                   start of string
#   [a-zA-Z0-9._%+-]+   one or more alphanumeric, dot, underscore, percent,
#                       plus, or hyphen characters (username)
#   @                   literal '@'
#   [a-zA-Z0-9.-]+      one or more alphanumeric, dot, or hyphen characters
#                       (domain name)
#   \.                  literal '.'
#   [a-zA-Z]{2,}        two or more alphabetic characters (top-level domain)
#   $                   end of string
email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"


def validate_email(email):
    """Return True if *email* matches ``email_regex``, otherwise False.

    Missing values (None/NaN) are reported as invalid. Any other value is
    converted with ``str()`` before matching.
    """
    if pd.isna(email):
        return False
    # fullmatch rather than match: with re.match the trailing `$` also matches
    # just before a final newline, so "user@example.com\n" would be accepted.
    return re.fullmatch(email_regex, str(email)) is not None


# Load the customer data dataset (or ensure it's loaded from previous tasks)
try:
    customer_df = pd.read_csv('customer_data.csv')
except FileNotFoundError:
    print("Error: Make sure 'customer_data.csv' is in the same directory.")
    customer_df = pd.DataFrame()

if not customer_df.empty and 'email' in customer_df.columns:
    # Apply the validation function to the 'email' column
    customer_df['email_is_valid'] = customer_df['email'].apply(validate_email)

    # 'customer_id' is only used for display. Include it when it exists so a
    # file without that column is still validated instead of raising KeyError.
    display_cols = [col for col in ('customer_id', 'email')
                    if col in customer_df.columns]

    print("\nEmail validation results:")
    print(customer_df[display_cols + ['email_is_valid']])

    invalid_emails_df = customer_df[~customer_df['email_is_valid']]
    if not invalid_emails_df.empty:
        print("\nRecords with invalid email format:")
        print(invalid_emails_df[display_cols])
        print(f"Total invalid emails found: {len(invalid_emails_df)}")
    else:
        print("\nNo invalid email formats found.")
else:
    print("Skipping Task 1: Email Format Compliance (customer_data.csv not found or 'email' column missing).")

--- Task 1: Format Compliance for Email ---
Error: Make sure 'customer_data.csv' is in the same directory.
Skipping Task 1: Email Format Compliance (customer_data.csv not found or 'email' column missing).


## Format Compliance for Phone Numbers

**Description**: Verify if the phone numbers follow a specific pattern.

In [None]:
# Write your code from here

In [3]:
import pandas as pd
import re

# --- Task 2: Format Compliance for Phone Numbers ---
print("\n--- Task 2: Format Compliance for Phone Numbers ---")

# Regex pattern for common North American phone number formats:
# (XXX) XXX-XXXX, XXX-XXX-XXXX, XXX.XXX.XXXX, XXXXXXXXXX, +X XXX XXX XXXX
#   (?:\+?\d{1,3}[-.\s]?)?   optional country code (1-3 digits, optional '+'),
#                            followed by an optional separator. The whole
#                            group is optional; a bare lazy `\d{1,3}?` would
#                            still demand at least one digit and reject every
#                            10-digit number.
#   (?:\(\d{3}\)|\d{3})      area code, with parentheses either on both sides
#                            or not at all
#   [-.\s]?\d{3}[-.\s]?\d{4} exchange and line number with optional separators
phone_regex = r"^(?:\+?\d{1,3}[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}$"


def validate_phone(phone):
    """Return True if *phone* matches ``phone_regex``, otherwise False.

    Missing values (None/NaN) are reported as invalid. The value is matched
    as written (after ``str()``), without stripping any characters, so the
    check is a strict format-compliance test.
    """
    if pd.isna(phone):
        return False
    # fullmatch rather than match: with re.match the trailing `$` also matches
    # just before a final newline.
    return re.fullmatch(phone_regex, str(phone)) is not None


# Load the customer data dataset with phone numbers.
# The phone column is read as text: an all-digit column would otherwise be
# parsed as a number, which drops leading zeros and, when values are missing,
# turns 1234567890 into the float 1234567890.0.
try:
    phone_df = pd.read_csv('customer_data_with_phone.csv',
                           dtype={'phone_number': str})
except FileNotFoundError:
    print("Error: Make sure 'customer_data_with_phone.csv' is in the same directory.")
    phone_df = pd.DataFrame()

if not phone_df.empty and 'phone_number' in phone_df.columns:
    # Apply the validation function to the 'phone_number' column
    phone_df['phone_number_is_valid'] = phone_df['phone_number'].apply(validate_phone)

    # 'customer_id' is only used for display. Include it when it exists so a
    # file without that column is still validated instead of raising KeyError.
    display_cols = [col for col in ('customer_id', 'phone_number')
                    if col in phone_df.columns]

    print("\nPhone number validation results:")
    print(phone_df[display_cols + ['phone_number_is_valid']])

    invalid_phone_numbers_df = phone_df[~phone_df['phone_number_is_valid']]
    if not invalid_phone_numbers_df.empty:
        print("\nRecords with invalid phone number format:")
        print(invalid_phone_numbers_df[display_cols])
        print(f"Total invalid phone numbers found: {len(invalid_phone_numbers_df)}")
    else:
        print("\nNo invalid phone number formats found.")
else:
    print("Skipping Task 2: Phone Number Format Compliance (customer_data_with_phone.csv not found or 'phone_number' column missing).")


--- Task 2: Format Compliance for Phone Numbers ---
Error: Make sure 'customer_data_with_phone.csv' is in the same directory.
Skipping Task 2: Phone Number Format Compliance (customer_data_with_phone.csv not found or 'phone_number' column missing).


## Checking Date Validity Format

**Description**: Ensure the dates in a list adhere to a specific format (e.g., YYYY-MM-DD).

In [None]:
# Write your code from here

In [4]:
import pandas as pd

# --- Task 3: Checking Date Validity Format ---
print("\n--- Task 3: Checking Date Validity Format ---")

# The format every date string is required to follow (YYYY-MM-DD).
desired_date_format = '%Y-%m-%d'


def parse_dates_strict(dates, date_format=desired_date_format):
    """Parse *dates*, keeping only values written exactly in *date_format*.

    Args:
        dates: Series (or any list-like) of raw date values.
        date_format: strftime/strptime-style format the values must follow.

    Returns:
        A datetime Series with the same index as *dates*. An entry is NaT
        when the value is missing, cannot be parsed with *date_format*, or
        parses but is not spelled exactly as the format dictates.
    """
    dates = pd.Series(dates)
    # errors='coerce' turns anything unparseable (and missing values) into NaT.
    parsed = pd.to_datetime(dates, format=date_format, errors='coerce')
    # Round-trip check: render each parsed date back through the format and
    # require it to equal the original text. This rejects spellings a lenient
    # parser might still accept (e.g. '2023-1-5' or surrounding whitespace)
    # without hard-coding the shape of one particular format.
    round_trip_ok = parsed.dt.strftime(date_format).eq(dates.astype(str))
    return parsed.where(round_trip_ok)


# Load the event data dataset
try:
    event_df = pd.read_csv('event_data.csv')
except FileNotFoundError:
    print("Error: Make sure 'event_data.csv' is in the same directory.")
    event_df = pd.DataFrame()

if not event_df.empty and 'event_date' in event_df.columns:
    # Parsed value for compliant dates, NaT for everything else.
    event_df['event_date_parsed'] = parse_dates_strict(event_df['event_date'],
                                                       desired_date_format)

    # A record is valid exactly when it survived the strict parse.
    event_df['date_format_is_valid'] = event_df['event_date_parsed'].notna()

    # 'event_id' is only used for display. Include it when it exists so a
    # file without that column is still validated instead of raising KeyError.
    display_cols = [col for col in ('event_id', 'event_date')
                    if col in event_df.columns]

    print("\nDate format validation results:")
    print(event_df[display_cols + ['event_date_parsed', 'date_format_is_valid']])

    invalid_dates_df = event_df[~event_df['date_format_is_valid']]
    if not invalid_dates_df.empty:
        print("\nRecords with invalid or non-compliant date format:")
        print(invalid_dates_df[display_cols])
        print(f"Total invalid date formats found: {len(invalid_dates_df)}")
    else:
        # Missing dates are flagged as invalid above, so reaching this branch
        # means every date is present as well as compliant.
        print("\nAll dates are present and adhere to the desired format.")

    # Note: This check also flags original NaNs as invalid against the format.
    # If you need to distinguish between 'missing' and 'malformed', you'd do:
    # 1. Check for original nulls: event_df['event_date'].isnull()
    # 2. Check for malformed (non-null original, but NaT after the strict parse):
    #    (event_df['event_date'].notnull()) & (event_df['event_date_parsed'].isna())

else:
    print("Skipping Task 3: Date Validity Format (event_data.csv not found or 'event_date' column missing).")


--- Task 3: Checking Date Validity Format ---
Error: Make sure 'event_data.csv' is in the same directory.
Skipping Task 3: Date Validity Format (event_data.csv not found or 'event_date' column missing).
