## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset, `students.csv`, that contains the following
columns: `ID`, `Name`, `Age`, `Grade`, `Email`.

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [None]:
# Write your code from here
import pandas as pd
import re

# Load the sample dataset 'students.csv' into `df`; every later check reads it.
try:
    df = pd.read_csv('students.csv')
except FileNotFoundError:
    print("Error: 'students.csv' not found. Please make sure the file is in the same directory.")
    # Raise SystemExit explicitly instead of calling exit(): the site-provided
    # exit() helper is meant for interactive use and does not reliably halt
    # execution in a notebook cell, which lets the code below run without
    # `df` and fail with a confusing NameError.
    raise SystemExit(1)

print("Original DataFrame:")
print(df)
print("-" * 50)

# --- 1. Check Accuracy ---

# --- Verify Numerical Data Accuracy ---
print("\n--- 1.1. Verify Numerical Data Accuracy (Grade) ---")
# Assuming 'Grade' should be a numerical value within a reasonable range (e.g., 0-100)
# Parse the column once; entries that cannot be read as a number become NaN.
grade_numeric = pd.to_numeric(df['Grade'], errors='coerce')
df['grade_numerical'] = grade_numeric.notna()
print("Is 'Grade' numerical:")
print(df[['Grade', 'grade_numerical']])

grade_min = 0
grade_max = 100
# between() is inclusive on both bounds and evaluates to False for NaN, so
# non-numeric or missing grades are reported as out of range.
df['grade_within_range'] = grade_numeric.between(grade_min, grade_max)
print(f"\nIs 'Grade' within the range [{grade_min}, {grade_max}]:")
print(df[['Grade', 'grade_within_range']])
print("-" * 50)

# --- Validate Email Format ---
print("\n--- 1.2. Validate Email Format ---")
# Compiled once so the pattern is not looked up again for every row.
_EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")


def is_valid_email(email):
    """Return True if *email* is a string shaped like ``local@domain.tld``.

    Args:
        email: The value to check. Anything that is not a ``str`` (for
            example a missing value) is treated as invalid.

    Returns:
        bool: True only when the entire string matches the address pattern.
    """
    if not isinstance(email, str):
        return False
    # fullmatch() requires the whole string to match. With match(), the "$"
    # anchor also matches just before a trailing newline, so a value such as
    # "user@example.com\n" would wrongly be accepted.
    return _EMAIL_PATTERN.fullmatch(email) is not None

# Run the format check on every 'Email' entry and store the per-row verdict
# next to the data so both can be displayed side by side.
email_flags = df['Email'].apply(is_valid_email)
df['email_valid_format'] = email_flags
print("Is 'Email' valid format:")
email_report_columns = ['Email', 'email_valid_format']
print(df[email_report_columns])
print("-" * 50)

# --- Integer Accuracy Check for Age ---
print("\n--- 1.3. Integer Accuracy Check for Age ---")
# Parse 'Age' numerically instead of testing isinstance(x, int): the Python
# type of each value depends on how the whole column was loaded. A single
# missing Age makes the column floating point (20 arrives as 20.0) and a
# single non-numeric entry makes every value a string, so the isinstance test
# would reject every row. Unparseable entries become NaN here.
age_numeric = pd.to_numeric(df['Age'], errors='coerce')
# A value counts as an integer when it is present and has no fractional part.
df['age_is_integer'] = age_numeric.notna() & (age_numeric % 1 == 0)
print("Is 'Age' an integer:")
print(df[['Age', 'age_is_integer']])

# Optional: Check for logical age range (e.g., 5-100 for students)
age_min = 5
age_max = 100
# Only whole-number ages can be logical; between() is inclusive on both
# bounds and evaluates to False for NaN.
df['age_logical'] = df['age_is_integer'] & age_numeric.between(age_min, age_max)
print(f"\nIs 'Age' within the logical range [{age_min}, {age_max}]:")
print(df[['Age', 'age_logical']])
print("-" * 50)

# --- 2. Check Completeness ---

# --- Identify Missing Values ---
print("\n--- 2.1. Identify Missing Values ---")
# Boolean frame marking every missing cell; reused by the checks below.
null_mask = df.isna()
missing_values = null_mask.sum()
print("Number of missing values per column:")
print(missing_values)
print("-" * 50)

# --- Rows with Missing Data ---
print("\n--- 2.2. Rows with Missing Data ---")
# Keep the rows in which at least one column is missing.
has_any_missing = null_mask.any(axis=1)
rows_with_missing = df[has_any_missing]
print("Rows containing at least one missing value:")
print(rows_with_missing)
print("-" * 50)

# --- Column Specific Missing Value Check ---
print("\n--- 2.3. Column Specific Missing Value Check ---")
column_to_check = 'Email'
# Keep the rows whose value in the chosen column is missing.
missing_in_column = null_mask[column_to_check]
missing_in_email = df[missing_in_column]
print(f"Rows where '{column_to_check}' is missing:")
print(missing_in_email)
print("-" * 50)

Error: 'students.csv' not found. Please make sure the file is in the same directory.
Original DataFrame:


NameError: name 'df' is not defined

: 