Type Checking: Verify that each value has the expected data type.

In [1]:
import pandas as pd
import numpy as np

# Number of rows in the DataFrame
num_rows = 10

# Fixed seed and a local generator, so re-running the notebook from a fresh
# kernel reproduces the same sample without touching NumPy's global RNG state.
SEED = 42
rng = np.random.default_rng(SEED)

# Each row is missing (NaN) with probability ~0.2; otherwise it holds a random
# age between 1 and 100, stored as a string the way raw text input would be.
ages = [str(rng.integers(1, 101)) if rng.random() > 0.2 else np.nan for _ in range(num_rows)]
ages[0] = "hello"  # Adding a string error
ages[5] = None     # Adding a null value error

# Create the DataFrame with the "age" column
df = pd.DataFrame({'age': ages})

# Define the validation function
def validate_age(age):
    """Classify one raw age value as valid, or describe why it is not.

    Args:
        age: The raw value to check, e.g. a string, a number, ``None`` or NaN.

    Returns:
        str: ``"Valid"`` when ``age`` represents a whole number between 1 and
        150 (inclusive); otherwise one of the ``"Error: ..."`` messages below.
    """
    if pd.isnull(age):
        return "Error: Age value is null."
    # bool is a subclass of int, so True/False would otherwise pass as 1/0.
    if isinstance(age, bool):
        return "Error: Age must be a positive integer."
    try:
        age_int = int(age)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text; TypeError: unsupported types;
        # OverflowError: infinite floats.
        return "Error: Age must be a positive integer."
    # int() truncates floats, so 25.7 would silently be accepted as 25.
    if isinstance(age, float) and age != age_int:
        return "Error: Age must be a positive integer."
    if age_int < 1 or age_int > 150:
        return "Error: Age must be between 1 and 150."
    return "Valid"

# Run validate_age over every entry of the "age" column, then store the
# resulting status messages alongside the raw values in a new column.
age_status = df['age'].apply(validate_age)
df['age_validation'] = age_status

print(df)

     age                          age_validation
0  hello  Error: Age must be a positive integer.
1    NaN               Error: Age value is null.
2      3                                   Valid
3     47                                   Valid
4     27                                   Valid
5   None               Error: Age value is null.
6    NaN               Error: Age value is null.
7    NaN               Error: Age value is null.
8     76                                   Valid
9      5                                   Valid


Key Existence Check: Ensure that required keys are present in the data. The example below reports every missing value by row and column, and also flags invalid ages.

In [18]:
import pandas as pd

def validate_student_data(student_data, required_columns=()):
    """Validate a student DataFrame and raise with every problem found.

    Checks performed:
      * every name in ``required_columns`` is a column of ``student_data``;
      * no cell holds a missing value (as reported by ``pd.isna``);
      * every non-missing value in the 'age' column is an integer (not a
        bool) between 1 and 150 inclusive.

    Args:
        student_data: DataFrame holding one student per row.
        required_columns: Optional iterable of column names that must exist.
            Defaults to no required columns.

    Returns:
        None when no problem is found.

    Raises:
        ValueError: If any check fails. The message lists missing columns
            first, then cell problems in row order, one per line.
    """
    error_messages = [
        f"Missing required column '{column}'."
        for column in required_columns
        if column not in student_data.columns
    ]

    columns = list(student_data.columns)
    # itertuples keeps each column's own dtype. iterrows would upcast an
    # all-numeric frame to float, making every integer age look invalid.
    rows = student_data.itertuples(index=False, name=None)
    for index, values in zip(student_data.index, rows):
        for key, value in zip(columns, values):
            if pd.isna(value):
                error_messages.append(f"Missing value in row {index}, column '{key}'.")
            # is_integer accepts Python and NumPy integers but rejects bools;
            # the lower bound is 1 to match the message below.
            elif key == 'age' and (not pd.api.types.is_integer(value) or value < 1 or value > 150):
                error_messages.append(f"Invalid age '{value}' in row {index}. Age must be a positive integer between 1 and 150.")

    if error_messages:
        raise ValueError("\n".join(error_messages))

# Example roster with deliberate problems: a non-numeric age, a negative age,
# and a missing roll number.
student_data = dict(
    name=['John', 'Alice', 'Bob', 'Charlie'],
    age=[25, 23, 'invalid', -5],
    roll_number=[101, 102, None, 104],
)

# Build the frame and show it before validating
df = pd.DataFrame(student_data)
print(df)

# Run the validator; it raises ValueError carrying one line per problem found
try:
    validate_student_data(df)
    print("Data is valid.")
except ValueError as validation_error:
    print(f"Invalid data:\n{validation_error}")


      name      age  roll_number
0     John       25        101.0
1    Alice       23        102.0
2      Bob  invalid          NaN
3  Charlie       -5        104.0
Invalid data:
Invalid age 'invalid' in row 2. Age must be a positive integer between 1 and 150.
Missing value in row 2, column 'roll_number'.
Invalid age '-5' in row 3. Age must be a positive integer between 1 and 150.
