## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [None]:
# Write your code from here

In [1]:
import pandas as pd
import re

# --- Configuration ---
STUDENTS_DATA_PATH = 'swiggy.csv'

# --- Utility Function to Load Data ---
def load_data(file_path):
    """
    Loads a CSV file into a Pandas DataFrame.
    Handles FileNotFoundError gracefully.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"\nSuccessfully loaded {file_path}")
        return df
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure it's in the same directory.")
        return pd.DataFrame() # Return empty DataFrame on error

# Load the student dataset for all tasks
students_df = load_data(STUDENTS_DATA_PATH)

if students_df.empty:
    print("Cannot proceed with data quality checks as the dataset could not be loaded.")
else:
    print("\n--- Objective: Assess Data Quality (Accuracy & Completeness) ---")

    # --- 1. Check Accuracy ---
    print("\n--- 1. Checking Accuracy ---")

    # --- 1.1 Verify Numerical Data Accuracy (Grade) ---
    print("\n--- 1.1 Verify Numerical Data Accuracy (Grade) ---")
    # Assuming Grade should be between 0 and 100
    min_grade = 0.0
    max_grade = 100.0

    # Ensure 'Grade' is numeric, coercing errors to NaN
    students_df['Grade_numeric'] = pd.to_numeric(students_df['Grade'], errors='coerce')

    # Identify grades outside the valid range or non-numeric
    invalid_grades = students_df[
        (students_df['Grade_numeric'].isna() & students_df['Grade'].notnull()) | # Non-numeric grades
        (students_df['Grade_numeric'] < min_grade) |
        (students_df['Grade_numeric'] > max_grade)
    ]

    if not invalid_grades.empty:
        print(f"Records with 'Grade' outside the valid range ({min_grade}-{max_grade}) or non-numeric:")
        print(invalid_grades[['ID', 'Name', 'Grade']])
    else:
        print("All 'Grade' values are within the valid range and are numeric.")

    # --- 1.2 Validate Email Format ---
    print("\n--- 1.2 Validate Email Format ---")
    # Basic regex pattern for email validation
    email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    def validate_email_format(email):
        if pd.isna(email):
            return False # Consider missing emails as invalid format for this check
        return bool(re.match(email_regex, str(email)))

    students_df['Email_format_valid'] = students_df['Email'].apply(validate_email_format)
    invalid_email_formats = students_df[students_df['Email_format_valid'] == False]

    if not invalid_email_formats.empty:
        print("Records with invalid 'Email' format:")
        print(invalid_email_formats[['ID', 'Name', 'Email']])
    else:
        print("All 'Email' addresses adhere to the specified format.")

    # --- 1.3 Integer Accuracy Check for Age ---
    print("\n--- 1.3 Integer Accuracy Check for Age ---")
    # Assuming Age should be an integer and within a reasonable human range (e.g., 0-120)
    min_age = 0
    max_age = 120

    # Ensure 'Age' is numeric, coercing errors to NaN
    students_df['Age_numeric'] = pd.to_numeric(students_df['Age'], errors='coerce')

    # Identify ages that are not integers, outside range, or non-numeric
    invalid_ages = students_df[
        (students_df['Age_numeric'].isna() & students_df['Age'].notnull()) | # Non-numeric ages
        (students_df['Age_numeric'] < min_age) | # Age below min
        (students_df['Age_numeric'] > max_age) | # Age above max
        (students_df['Age_numeric'] != students_df['Age_numeric'].astype(int)) # Not an integer
    ]

    if not invalid_ages.empty:
        print(f"Records with 'Age' outside logical range ({min_age}-{max_age}), non-numeric, or not an integer:")
        print(invalid_ages[['ID', 'Name', 'Age']])
    else:
        print("All 'Age' values are within the logical range and are integers.")

    # --- Clean up temporary columns used for accuracy checks ---
    students_df.drop(columns=[col for col in students_df.columns if col.endswith(('_numeric', '_format_valid'))], inplace=True)


    # --- 2. Check Completeness ---
    print("\n--- 2. Checking Completeness ---")

    # --- 2.1 Identify Missing Values (Overall) ---
    print("\n--- 2.1 Identify Missing Values (Overall) ---")
    print("Total null values across the dataset:")
    print(students_df.isnull().sum().sum()) # Sum of all nulls in all columns

    # --- 2.2 Rows with Missing Data ---
    print("\n--- 2.2 Rows with Missing Data ---")
    # Identify rows that have at least one missing value in any column
    rows_with_missing_data = students_df[students_df.isnull().any(axis=1)]

    if not rows_with_missing_data.empty:
        print("Rows with at least one missing value:")
        print(rows_with_missing_data)
    else:
        print("No rows found with any missing values.")

    # --- 2.3 Column Specific Missing Value Check ---
    print("\n--- 2.3 Column Specific Missing Value Check ---")
    print("Missing values per column:")
    print(students_df.isnull().sum())

    print("\nPercentage of missing values per column:")
    total_rows = len(students_df)
    if total_rows > 0:
        missing_percentage_per_column = (students_df.isnull().sum() / total_rows) * 100
        print(missing_percentage_per_column.round(2).astype(str) + '%')
    else:
        print("No rows in the DataFrame to calculate percentages.")




Successfully loaded swiggy.csv

--- Objective: Assess Data Quality (Accuracy & Completeness) ---

--- 1. Checking Accuracy ---

--- 1.1 Verify Numerical Data Accuracy (Grade) ---


KeyError: 'Grade'