Type Checking: Verify that the data is of the expected data types.

In [1]:
import pandas as pd
import numpy as np

# Number of rows in the DataFrame (must be at least 6, because index 5 is
# overwritten below)
num_rows = 10

# Generate random age values between 1 and 100, and add some errors.
# Each entry is the *string* form of a random integer when
# np.random.rand() > 0.2, and NaN otherwise. No random seed is set, so the
# generated values differ from run to run.
ages = [str(np.random.randint(1, 101)) if np.random.rand() > 0.2 else np.nan for _ in range(num_rows)]
ages[0] = "hello"  # Adding a string error
ages[5] = None     # Adding a null value error

# Create the DataFrame with the "age" column
df = pd.DataFrame({'age': ages})

# Define the validation function
def validate_age(age):
    """Classify a single age value and return a status string.

    Args:
        age: The value to check. Strings are parsed with ``int()``; numbers
            are accepted only when they are whole.

    Returns:
        ``"Valid"`` when ``age`` is a whole number between 1 and 150
        (inclusive), otherwise one of the ``"Error: ..."`` messages below.
        The function never raises for malformed input.
    """
    # pd.isnull() returns an array for list-like input, whose truth value is
    # ambiguous; only scalar inputs can be null here.
    if pd.api.types.is_scalar(age) and pd.isnull(age):
        return "Error: Age value is null."

    try:
        parsed = int(age)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types such as lists; OverflowError: infinity.
        return "Error: Age must be a positive integer."

    # int() silently truncates numbers (3.7 -> 3), so reject any numeric
    # input that does not round-trip. Text input was already parsed strictly.
    if not isinstance(age, (str, bytes, bytearray)) and age != parsed:
        return "Error: Age must be a positive integer."

    if parsed < 1 or parsed > 150:
        return "Error: Age must be between 1 and 150."
    return "Valid"

# Store the per-row validation status next to the raw "age" values
df['age_validation'] = df['age'].map(validate_age)

print(df)

     age                          age_validation
0  hello  Error: Age must be a positive integer.
1    NaN               Error: Age value is null.
2      3                                   Valid
3     47                                   Valid
4     27                                   Valid
5   None               Error: Age value is null.
6    NaN               Error: Age value is null.
7    NaN               Error: Age value is null.
8     76                                   Valid
9      5                                   Valid


Key Existence Check: Ensure that required keys are present in data.

In [18]:
import pandas as pd

def validate_student_data(student_data):
    """Check every cell of a student DataFrame and raise if any is invalid.

    A cell is reported when it is missing (``pd.isna``), or when it is in the
    'age' column and is not an integer between 1 and 150 inclusive. Booleans
    are not accepted as ages.

    Args:
        student_data: pandas DataFrame to validate.

    Returns:
        None when no problems are found.

    Raises:
        ValueError: If any problem is found. The message lists all problems,
            one per line, in row order and then column order.
    """
    error_messages = []

    # Cast to object first: iterrows() builds one Series per row, which would
    # otherwise upcast ints to floats in an all-numeric frame and make every
    # valid age fail the integer check.
    for index, row in student_data.astype(object).iterrows():
        for key, value in row.items():
            if pd.isna(value):
                error_messages.append(f"Missing value in row {index}, column '{key}'.")
            elif key == 'age' and not (pd.api.types.is_integer(value) and 1 <= value <= 150):
                # is_integer() accepts Python and NumPy ints but not bools;
                # the lower bound is 1, matching the error message.
                error_messages.append(f"Invalid age '{value}' in row {index}. Age must be a positive integer between 1 and 150.")

    if error_messages:
        raise ValueError("\n".join(error_messages))

# Sample student data containing a non-numeric age, a negative age and a
# missing roll number
student_data = dict(
    name=['John', 'Alice', 'Bob', 'Charlie'],
    age=[25, 23, 'invalid', -5],
    roll_number=[101, 102, None, 104],
)

# Build the DataFrame and show it
df = pd.DataFrame(student_data)
print(df)

# Run the validation and report the outcome
try:
    validate_student_data(df)
except ValueError as err:
    print(f"Invalid data:\n{str(err)}")
else:
    print("Data is valid.")


      name      age  roll_number
0     John       25        101.0
1    Alice       23        102.0
2      Bob  invalid          NaN
3  Charlie       -5        104.0
Invalid data:
Invalid age 'invalid' in row 2. Age must be a positive integer between 1 and 150.
Missing value in row 2, column 'roll_number'.
Invalid age '-5' in row 3. Age must be a positive integer between 1 and 150.


Value Range Check: Validate that the data falls within a specified range or set of allowed values.

In [21]:
import pandas as pd

def validate_score(score):
    """Raise ValueError unless ``0 <= score <= 100``; otherwise return None."""
    within_bounds = 0 <= score <= 100
    if within_bounds:
        return
    raise ValueError("Score must be between 0 and 100.")

# Sample scores, three of which fall outside 0-100
scores_data = {'scores': [95, 78, 105, 88, -5, 70, 120]}

# Build the DataFrame and show it
df = pd.DataFrame(scores_data)
print(df)

# Collect the index label of every score that fails validation
invalid_rows = []
for index, score in df['scores'].items():
    try:
        validate_score(score)
    except ValueError:
        invalid_rows.append(index)

if not invalid_rows:
    print("All scores are valid.")
else:
    print(f"Invalid scores found in rows: {', '.join(map(str, invalid_rows))}.")

   scores
0      95
1      78
2     105
3      88
4      -5
5      70
6     120
Invalid scores found in rows: 2, 4, 6.


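Length Check: Verify that the length of the data is within acceptable bounds. (Note: the cell below repeats the value range check on scores; it does not demonstrate a length check.)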

In [24]:
import pandas as pd

def validate_score(score):
    """Accept scores in the inclusive range 0-100.

    Returns None for an in-range score and raises ValueError for anything
    that does not satisfy ``0 <= score <= 100``.
    """
    if 0 <= score <= 100:
        return None
    raise ValueError("Score must be between 0 and 100.")

# Sample scores data (includes values below 0 and above 100)
scores_data = dict(scores=[95, 78, 105, 88, -5, 70, 120])

# Create and display the DataFrame
df = pd.DataFrame(scores_data)
print(df)

# Walk index labels and scores together, noting each label that fails
invalid_rows = []
for label, value in zip(df.index, df['scores']):
    try:
        validate_score(value)
    except ValueError:
        invalid_rows.append(label)

if len(invalid_rows) > 0:
    print(f"Invalid scores found in rows: {', '.join(map(str, invalid_rows))}.")
else:
    print("All scores are valid.")



   scores
0      95
1      78
2     105
3      88
4      -5
5      70
6     120
Invalid scores found in rows: 2, 4, 6.


Format Validation: Ensure that the data follows a specific format (e.g., date format, email format).

In [23]:
import pandas as pd
import re

def validate_email(email):
    """Raise ValueError unless ``email`` looks like ``local@domain.tld``.

    The check is a simple pattern match (local part, '@', domain, a dot and
    a top-level domain of at least two letters); it does not verify that the
    address exists.

    Args:
        email: The value to check. Non-string values (e.g. None or NaN from a
            DataFrame) are treated as invalid.

    Returns:
        None when the address matches the pattern.

    Raises:
        ValueError: If ``email`` is not a string or does not match.
    """
    # No ^/$ anchors: fullmatch() requires the whole string to match, which
    # also rejects a trailing newline that `$` would have tolerated.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # Check the type first so callers catching ValueError never see the
    # TypeError that re raises for non-string input.
    if not isinstance(email, str) or not re.fullmatch(pattern, email):
        raise ValueError("Invalid email format.")

# Sample email data; three of the five addresses are malformed
email_data = dict(
    email=['john.doe@example.com', 'alice@example', 'invalid.email', 'charlie@com', 'bob@company.com'],
)

# Build the DataFrame and show it
df = pd.DataFrame(email_data)
print(df)

# Collect the index label of every address that fails validation
invalid_indices = []
for index, address in df['email'].items():
    try:
        validate_email(address)
    except ValueError:
        invalid_indices.append(index)

if not invalid_indices:
    print("All email addresses are valid.")
else:
    print(f"Invalid email addresses found in rows: {', '.join(map(str, invalid_indices))}.")



                  email
0  john.doe@example.com
1         alice@example
2         invalid.email
3           charlie@com
4       bob@company.com
Invalid email addresses found in rows: 1, 2, 3.


Data Integrity Check: Ensure that the structured data maintains integrity (e.g., relationships between data elements).

Uniqueness Check: Ensure that elements in a list or dictionary are unique.

Membership Check: Verify that elements in the data structure are present in a predefined set.

Required Fields Check: Ensure that specific fields are not empty or contain valid data.

Consistency Check: Validate that data is consistent across related structures (e.g., dictionaries in a list).

Validation Using Regular Expressions: Validate data against defined patterns using regular expressions.

Custom Validation Functions: Implement custom validation functions tailored to specific data requirements.

Business Rule Validation: Validate data against predefined business rules or constraints.

Dependency Check: Ensure that the presence of one field in the data implies the presence or absence of other fields.

Context-Based Validation: Validate data based on the context or state of the application.

Cross-Field Validation: Validate data across multiple fields for consistency.

Validation using External APIs: Validate data by calling external APIs or services.

Concurrency Control: Ensure data consistency in multi-threaded or multi-process environments.

Schema Validation: Use JSON Schema or other schema validation tools to validate structured data.

Strict Mode: Implement a strict mode for data validation, where certain constraints are enforced more rigorously.

Localization Validation: Validate data based on localization and regional standards (e.g., date formats, number formats).