## Check Uniqueness & Validity

**Objective**: Evaluate data quality by checking for uniqueness and validity of data entries.

For this activity, you will use the sample dataset `students.csv`, which contains the following
columns: `ID`, `Name`, `Age`, `Grade`, `Email`.

**Steps**:
1. Check Uniqueness
    - Unique IDs
    - Unique Email Addresses
    - Unique Combination

2. Check Validity
    - Validate Age Range
    - Validate Grade Scale
    - Validate Name Format

In [None]:
# Write your code from here
import pandas as pd
import re

# Load the sample dataset 'students.csv'.
# Every later step needs `df`, so a missing file must stop execution here.
try:
    df = pd.read_csv('students.csv')
except FileNotFoundError:
    print("Error: 'students.csv' not found. Please make sure the file is in the same directory.")
    # `exit()` is an interactive helper: in a notebook it only schedules a
    # kernel shutdown and the rest of the cell keeps running (ending in a
    # NameError on `df`). Raising SystemExit halts execution immediately,
    # both in a notebook cell and in a plain script.
    raise SystemExit(1) from None

# Show the raw data before any checks are run.
print("Original DataFrame:")
print(df)
print("-" * 50)

# --- 1. Check Uniqueness ---

# --- Unique IDs ---
# Report whether every row has a distinct ID, and list the offenders if not.
print("\n--- 1.1. Unique IDs ---")
# The verdict, the count and the listing all come from duplicated(), so they
# can never disagree. (nunique() skips missing values by default while
# duplicated() does not, so mixing the two mis-reports frames that contain a
# missing ID.)
id_repeat_mask = df.duplicated(subset=['ID'], keep=False)
total_ids = len(df)
# Rows that are not a repeat of an earlier row == number of distinct IDs.
unique_ids_count = int((~df.duplicated(subset=['ID'])).sum())
if not id_repeat_mask.any():
    print("All IDs are unique.")
else:
    # keep=False marks every member of a duplicate group, not just the repeats.
    duplicate_ids = df[id_repeat_mask]
    print(f"Total IDs: {total_ids}")
    print(f"Unique IDs: {unique_ids_count}")
    print("Duplicate IDs:")
    print(duplicate_ids[['ID']])
print("-" * 50)

# --- Unique Email Addresses ---
# Report whether every row has a distinct Email, and list the offenders if not.
print("\n--- 1.2. Unique Email Addresses ---")
# Use duplicated() for the verdict, the count and the listing so they always
# agree. (nunique() skips missing values by default while duplicated() does
# not, so mixing the two mis-reports frames that contain a missing Email.)
email_repeat_mask = df.duplicated(subset=['Email'], keep=False)
total_emails = len(df)
# Rows that are not a repeat of an earlier row == number of distinct emails.
unique_emails_count = int((~df.duplicated(subset=['Email'])).sum())
if not email_repeat_mask.any():
    print("All Email Addresses are unique.")
else:
    # keep=False marks every member of a duplicate group, not just the repeats.
    duplicate_emails = df[email_repeat_mask]
    print(f"Total Emails: {total_emails}")
    print(f"Unique Emails: {unique_emails_count}")
    print("Duplicate Email Addresses:")
    print(duplicate_emails[['Email']])
print("-" * 50)

# --- Unique Combination ---
# Report whether each (Name, Email) pair occurs only once.
print("\n--- 1.3. Unique Combination (Name and Email) ---")
# Compare the two columns directly instead of gluing them into one string:
# a joined key such as Name + '_' + Email is ambiguous ('a_b' + 'c' and
# 'a' + 'b_c' produce the same text) and would turn missing values into the
# literal 'nan'. This also avoids adding a temporary column to `df`.
combo_columns = ['Name', 'Email']
combo_repeat_mask = df.duplicated(subset=combo_columns, keep=False)
total_rows = len(df)
# Rows that are not a repeat of an earlier row == number of distinct pairs.
unique_combinations_count = int((~df.duplicated(subset=combo_columns)).sum())
if not combo_repeat_mask.any():
    print("The combination of Name and Email is unique for all records.")
else:
    # keep=False marks every member of a duplicate group, not just the repeats.
    duplicate_combinations = df[combo_repeat_mask]
    print(f"Total Records: {total_rows}")
    print(f"Unique Name-Email Combinations: {unique_combinations_count}")
    print("Records with Duplicate Name-Email Combinations:")
    print(duplicate_combinations[['Name', 'Email']])
print("-" * 50)

# --- 2. Check Validity ---

# --- Validate Age Range ---
# Flag every row whose Age is missing, non-numeric, or outside [min_age, max_age].
print("\n--- 2.1. Validate Age Range ---")
min_age = 5
max_age = 100
# Coerce to numbers first: a single non-numeric entry makes the CSV reader
# load the whole column as text, and comparing text against the integer
# bounds would raise TypeError. Unparseable entries become NaN.
numeric_age = pd.to_numeric(df['Age'], errors='coerce')
# between() is inclusive on both ends and evaluates to False for NaN, so
# missing and unparseable ages are reported as invalid.
df['age_valid_range'] = numeric_age.between(min_age, max_age)
invalid_age = df[~df['age_valid_range']]
print(f"Valid Age Range: [{min_age}, {max_age}]")
print("Records with Invalid Age:")
print(invalid_age[['Age']])
print("-" * 50)

# --- Validate Grade Scale ---
# Flag every row whose Grade is missing or not one of the allowed letters.
print("\n--- 2.2. Validate Grade Scale ---")
valid_grades = ['A', 'B', 'C', 'D', 'F']
# isin() is the vectorized membership test: it always yields a boolean mask
# and evaluates to False for missing values, so no per-row lambda is needed.
df['grade_valid_scale'] = df['Grade'].isin(valid_grades)
invalid_grade = df[~df['grade_valid_scale']]
print(f"Valid Grade Scale: {valid_grades}")
print("Records with Invalid Grade:")
print(invalid_grade[['Grade']])
print("-" * 50)

# --- Validate Name Format ---
print("\n--- 2.3. Validate Name Format ---")
def is_valid_name(name):
    """Return True if *name* is a non-blank string of letters and whitespace.

    Args:
        name: The value to check. Anything that is not a ``str`` (for
            example a missing value loaded as NaN, or a number) is invalid.

    Returns:
        bool: True when the string contains at least one non-whitespace
        character and consists solely of ASCII letters (a-z, A-Z) and
        whitespace; False otherwise.
    """
    if not isinstance(name, str):
        return False
    # A blank or whitespace-only entry is not a name, even though every
    # character in it belongs to the allowed set.
    if not name.strip():
        return False
    # NOTE(review): the allowed set is ASCII-only, so accented letters,
    # hyphens and apostrophes are rejected - confirm this is intended.
    # fullmatch() anchors the pattern to the whole string.
    return re.fullmatch(r"[a-zA-Z\s]+", name) is not None

# Record the per-row verdict of is_valid_name as a new column on the frame.
df['name_valid_format'] = df['Name'].apply(is_valid_name)

# Full overview first: every name next to its verdict.
print("Is Name in valid format (alphabetic characters and spaces only):")
print(df.loc[:, ['Name', 'name_valid_format']])

# Then only the rows that failed the check.
invalid_name_format = df.loc[~df['name_valid_format']]
print("Records with Invalid Name Format:")
print(invalid_name_format.loc[:, ['Name']])
print("-" * 50)

Error: 'students.csv' not found. Please make sure the file is in the same directory.
Original DataFrame:


NameError: name 'df' is not defined

: 