## Check Uniqueness & Validity

**Objective**: Evaluate data quality by checking for uniqueness and validity of data entries.

For this activity, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Uniqueness
    - Unique IDs
    - Unique Email Addresses
    - Unique Combination

2. Check Validity
    - Validate Age Range
    - Validate Grade Scale
    - Validate Name Format

In [None]:
# Write your code from here

In [3]:
import pandas as pd
import re

# --- Configuration ---
STUDENTS_DATA_PATH = 'swiggy.csv'

# --- Utility Function to Load Data ---
def load_data(file_path):
    """
    Loads a CSV file into a Pandas DataFrame.
    Handles FileNotFoundError gracefully.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"\nSuccessfully loaded {file_path}")
        return df
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure it's in the same directory.")
        return pd.DataFrame() # Return empty DataFrame on error

# Load the student dataset for all tasks
students_df = load_data(STUDENTS_DATA_PATH)

if students_df.empty:
    print("Cannot proceed with data quality checks as the dataset could not be loaded.")
else:
    print("\n--- Objective: Evaluate Data Quality (Uniqueness & Validity) ---")

    # --- 1. Check Uniqueness ---
    print("\n--- 1. Checking Uniqueness ---")

    # --- 1.1 Unique IDs ---
    print("\n--- 1.1 Unique IDs ---")
    is_id_unique = students_df['ID'].is_unique
    print(f"Is 'ID' column unique? {is_id_unique}")
    if not is_id_unique:
        # Show all occurrences of duplicate IDs
        duplicate_ids = students_df[students_df.duplicated(subset=['ID'], keep=False)]
        print("Records with duplicate 'ID's:")
        print(duplicate_ids[['ID', 'Name', 'Email']].sort_values('ID'))
    else:
        print("All 'ID's are unique.")

    # --- 1.2 Unique Email Addresses ---
    print("\n--- 1.2 Unique Email Addresses ---")
    # Check uniqueness, ignoring null email addresses
    non_null_emails = students_df['Email'].dropna()
    is_email_unique = non_null_emails.is_unique
    print(f"Are 'Email' addresses unique (excluding nulls)? {is_email_unique}")
    if not is_email_unique:
        # Show all occurrences of duplicate emails, excluding nulls
        duplicate_emails = students_df[students_df.duplicated(subset=['Email'], keep=False) & students_df['Email'].notnull()]
        print("Records with duplicate 'Email' addresses:")
        print(duplicate_emails[['ID', 'Name', 'Email']].sort_values('Email'))
    else:
        print("All non-null 'Email' addresses are unique.")

    # --- 1.3 Unique Combination (e.g., Name and Age) ---
    print("\n--- 1.3 Unique Combination (e.g., Name and Age) ---")
    # Check uniqueness of a combination of columns, e.g., 'Name' and 'Age'
    # This can help identify if different people might have the same name and age, or if it's a duplicate record.
    is_name_age_combination_unique = students_df.duplicated(subset=['Name', 'Age']).any()
    print(f"Is 'Name' and 'Age' combination unique? {not is_name_age_combination_unique}") # Invert the boolean for clarity
    if is_name_age_combination_unique:
        # Show records where the Name and Age combination is duplicated
        duplicate_name_age_comb = students_df[students_df.duplicated(subset=['Name', 'Age'], keep=False)]
        print("Records with duplicate 'Name' and 'Age' combinations:")
        print(duplicate_name_age_comb[['ID', 'Name', 'Age', 'Email']].sort_values(by=['Name', 'Age']))
    else:
        print("All 'Name' and 'Age' combinations are unique.")


    # --- 2. Check Validity ---
    print("\n--- 2. Checking Validity ---")

    # --- 2.1 Validate Age Range ---
    print("\n--- 2.1 Validate Age Range ---")
    min_age_valid = 0
    max_age_valid = 120

    # Ensure 'Age' is numeric, coercing errors to NaN for validation
    students_df['Age_numeric_for_validation'] = pd.to_numeric(students_df['Age'], errors='coerce')

    invalid_age_range_records = students_df[
        (students_df['Age_numeric_for_validation'].isna() & students_df['Age'].notnull()) | # Non-numeric
        (students_df['Age_numeric_for_validation'] < min_age_valid) |
        (students_df['Age_numeric_for_validation'] > max_age_valid)
    ]

    if not invalid_age_range_records.empty:
        print(f"Records with 'Age' outside the valid range ({min_age_valid}-{max_age_valid}) or non-numeric:")
        print(invalid_age_range_records[['ID', 'Name', 'Age']])
    else:
        print("All 'Age' values are within the valid range and are numeric.")

    # --- 2.2 Validate Grade Scale ---
    print("\n--- 2.2 Validate Grade Scale ---")
    min_grade_valid = 0.0
    max_grade_valid = 100.0

    # Ensure 'Grade' is numeric, coercing errors to NaN for validation
    students_df['Grade_numeric_for_validation'] = pd.to_numeric(students_df['Grade'], errors='coerce')

    invalid_grade_scale_records = students_df[
        (students_df['Grade_numeric_for_validation'].isna() & students_df['Grade'].notnull()) | # Non-numeric
        (students_df['Grade_numeric_for_validation'] < min_grade_valid) |
        (students_df['Grade_numeric_for_validation'] > max_grade_valid)
    ]

    if not invalid_grade_scale_records.empty:
        print(f"Records with 'Grade' outside the valid scale ({min_grade_valid}-{max_grade_valid}) or non-numeric:")
        print(invalid_grade_scale_records[['ID', 'Name', 'Grade']])
    else:
        print("All 'Grade' values are within the valid scale and are numeric.")

    # --- 2.3 Validate Name Format ---
    print("\n--- 2.3 Validate Name Format ---")
    # Assuming a name should only contain alphabetic characters, spaces, and hyphens (for hyphenated names)
    # This regex checks for one or more word characters (letters, numbers, underscore) or spaces/hyphens.
    # A stricter regex might be r"^[A-Za-z\s'-]+$" for only letters, spaces, apostrophes, and hyphens.
    name_regex = r"^[A-Za-z\s'-]+$"

    def validate_name_format(name):
        if pd.isna(name):
            return False # Consider missing names as invalid format
        return bool(re.fullmatch(name_regex, str(name))) # fullmatch ensures the whole string matches

    students_df['Name_format_valid'] = students_df['Name'].apply(validate_name_format)
    invalid_name_formats = students_df[students_df['Name_format_valid'] == False]

    if not invalid_name_formats.empty:
        print("Records with invalid 'Name' format:")
        print(invalid_name_formats[['ID', 'Name']])
    else:
        print("All 'Name' values adhere to the specified format.")

    # --- Clean up temporary columns used for validity checks ---
    students_df.drop(columns=[col for col in students_df.columns if col.endswith(('_numeric_for_validation', '_format_valid'))], inplace=True)




Successfully loaded swiggy.csv

--- Objective: Evaluate Data Quality (Uniqueness & Validity) ---

--- 1. Checking Uniqueness ---

--- 1.1 Unique IDs ---
Is 'ID' column unique? True
All 'ID's are unique.

--- 1.2 Unique Email Addresses ---


KeyError: 'Email'