Type Checking: Verify that the data is of the expected data types.

In [1]:
import pandas as pd
import numpy as np

# How many rows the demo DataFrame holds
num_rows = 10


def _draw_age():
    """Return a random age in [1, 100] as a string, or NaN about one time in five."""
    if np.random.rand() > 0.2:
        return str(np.random.randint(1, 101))
    return np.nan


# Draw the raw "age" values, then plant two deliberate defects
ages = [_draw_age() for _ in range(num_rows)]
ages[0] = "hello"  # non-numeric text
ages[5] = None     # explicit null

# Wrap the values in a single-column DataFrame
df = pd.DataFrame({'age': ages})

# Define the validation function
def validate_age(age):
    """Classify a single raw age value and return a status message.

    Args:
        age: The value to check. Integer-like strings and whole numbers are
            accepted; anything else is reported as an error.

    Returns:
        "Valid" when the value is a whole number from 1 to 150, otherwise
        one of the "Error: ..." messages describing the problem.
    """
    if pd.isnull(age):
        return "Error: Age value is null."
    # int() silently truncates a fractional float (3.7 -> 3), which would let
    # a non-integer through as "Valid"; reject it explicitly instead.
    if isinstance(age, float) and not age.is_integer():
        return "Error: Age must be a positive integer."
    try:
        age = int(age)
    except (ValueError, TypeError, OverflowError):
        # int() raises ValueError for non-numeric text, TypeError for types it
        # cannot convert, and OverflowError for infinite values.
        return "Error: Age must be a positive integer."
    if age < 1 or age > 150:
        return "Error: Age must be between 1 and 150."
    return "Valid"

# Run the validator over every "age" entry and keep the status text next to it
df['age_validation'] = df['age'].map(validate_age)

print(df)

     age                          age_validation
0  hello  Error: Age must be a positive integer.
1    NaN               Error: Age value is null.
2      3                                   Valid
3     47                                   Valid
4     27                                   Valid
5   None               Error: Age value is null.
6    NaN               Error: Age value is null.
7    NaN               Error: Age value is null.
8     76                                   Valid
9      5                                   Valid


Key Existence Check: Ensure that required keys are present in data.

In [18]:
import pandas as pd

def validate_student_data(student_data, required_columns=None):
    """Validate a student DataFrame and raise if any problem is found.

    Args:
        student_data: DataFrame with one row per student. Every cell is
            checked for missing values; cells in the 'age' column must also
            be integers from 1 to 150.
        required_columns: Optional iterable of column names that must exist
            in the DataFrame. Defaults to no key-existence check.

    Raises:
        ValueError: If at least one problem is found. The message lists every
            problem, one per line: missing columns first, then cell problems
            in row order.
    """
    import numbers  # local import: only this validator needs it

    error_messages = []

    for column in (required_columns or ()):
        if column not in student_data.columns:
            error_messages.append(f"Missing required column '{column}'.")

    # iterrows() builds one Series per row, so an all-numeric frame would turn
    # integer ages into floats. Casting to object first keeps each cell's type.
    for index, row in student_data.astype(object).iterrows():
        for key, value in row.items():
            if pd.isna(value):
                error_messages.append(f"Missing value in row {index}, column '{key}'.")
            elif key == 'age':
                # bool is a subclass of int, so exclude it explicitly.
                is_integer = isinstance(value, numbers.Integral) and not isinstance(value, bool)
                # Lower bound is 1, matching the range stated in the message.
                if not is_integer or value < 1 or value > 150:
                    error_messages.append(f"Invalid age '{value}' in row {index}. Age must be a positive integer between 1 and 150.")

    if error_messages:
        raise ValueError("\n".join(error_messages))

# Demo records with deliberate problems: a non-numeric age, a negative age
# and a missing roll number
student_data = dict(
    name=['John', 'Alice', 'Bob', 'Charlie'],
    age=[25, 23, 'invalid', -5],
    roll_number=[101, 102, None, 104],
)

# Load the records into a DataFrame
df = pd.DataFrame(student_data)

print(df)

# Run the validator and report the outcome
try:
    validate_student_data(df)
except ValueError as e:
    print(f"Invalid data:\n{str(e)}")
else:
    print("Data is valid.")


      name      age  roll_number
0     John       25        101.0
1    Alice       23        102.0
2      Bob  invalid          NaN
3  Charlie       -5        104.0
Invalid data:
Invalid age 'invalid' in row 2. Age must be a positive integer between 1 and 150.
Missing value in row 2, column 'roll_number'.
Invalid age '-5' in row 3. Age must be a positive integer between 1 and 150.


Value Range Check: Validate that the data falls within a specified range or set of allowed values.

In [21]:
import pandas as pd

def validate_score(score):
    """Raise ValueError unless *score* is a number from 0 to 100 inclusive.

    Args:
        score: The value to check.

    Raises:
        ValueError: If the score is outside 0-100 or cannot be compared with
            numbers at all (for example None or a string).
    """
    try:
        in_range = 0 <= score <= 100
    except TypeError:
        # Callers in this file only catch ValueError, so report values that
        # cannot be compared the same way instead of letting TypeError escape.
        raise ValueError("Score must be between 0 and 100.") from None
    if not in_range:
        raise ValueError("Score must be between 0 and 100.")

# Demo scores; 105, -5 and 120 fall outside the 0-100 range
scores_data = dict(scores=[95, 78, 105, 88, -5, 70, 120])

# Load the scores into a DataFrame
df = pd.DataFrame(scores_data)

print(df)

# Collect the index label of every row whose score is rejected
invalid_rows = []
for index, score in df['scores'].items():
    try:
        validate_score(score)
    except ValueError:
        invalid_rows.append(index)

if not invalid_rows:
    print("All scores are valid.")
else:
    print(f"Invalid scores found in rows: {', '.join(map(str, invalid_rows))}.")

   scores
0      95
1      78
2     105
3      88
4      -5
5      70
6     120
Invalid scores found in rows: 2, 4, 6.


Length Check: Verify that the length of the data is within acceptable bounds.

In [24]:
import pandas as pd

def validate_score(score):
    """Check that *score* lies between 0 and 100 inclusive.

    NOTE(review): this cell sits under the "Length Check" heading, but the
    function validates a numeric range, not a length. It looks like a copy of
    the range-check example; confirm the intended example.

    Args:
        score: The value to check.

    Raises:
        ValueError: If the score is outside 0-100, or is a value such as None
            or a string that cannot be compared with numbers.
    """
    try:
        in_range = 0 <= score <= 100
    except TypeError:
        # The surrounding loop only catches ValueError; convert so one bad
        # cell does not abort the whole scan.
        raise ValueError("Score must be between 0 and 100.") from None
    if not in_range:
        raise ValueError("Score must be between 0 and 100.")

# Demo scores, three of which lie outside 0-100
scores_data = dict(scores=[95, 78, 105, 88, -5, 70, 120])

# Load the scores into a DataFrame
df = pd.DataFrame(scores_data)

print(df)

# Walk the column and remember the label of each rejected row
invalid_rows = []
for index, score in zip(df.index, df['scores']):
    try:
        validate_score(score)
    except ValueError:
        invalid_rows.append(index)

if not invalid_rows:
    print("All scores are valid.")
else:
    print(f"Invalid scores found in rows: {', '.join(map(str, invalid_rows))}.")



   scores
0      95
1      78
2     105
3      88
4      -5
5      70
6     120
Invalid scores found in rows: 2, 4, 6.


Format Validation: Ensure that the data follows a specific format (e.g., date format, email format).

In [23]:
import pandas as pd
import re

def validate_email(email):
    """Raise ValueError unless *email* looks like ``local@domain.tld``.

    Args:
        email: The value to check.

    Raises:
        ValueError: If the value is not a string or does not match the
            pattern in full.
    """
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch() is used because '$' also matches just before a trailing
    # newline, so re.match() would accept "a@b.com\n". The isinstance guard
    # keeps None/NaN cells from raising TypeError inside the regex engine.
    if not isinstance(email, str) or not re.fullmatch(pattern, email):
        raise ValueError("Invalid email format.")

# Demo addresses; 'alice@example', 'invalid.email' and 'charlie@com' are malformed
email_data = dict(
    email=['john.doe@example.com', 'alice@example', 'invalid.email', 'charlie@com', 'bob@company.com'],
)

# Load the addresses into a DataFrame
df = pd.DataFrame(email_data)

print(df)

# Record the index label of each address the validator rejects
invalid_indices = []
for index, email in df['email'].items():
    try:
        validate_email(email)
    except ValueError:
        invalid_indices.append(index)

if not invalid_indices:
    print("All email addresses are valid.")
else:
    print(f"Invalid email addresses found in rows: {', '.join(map(str, invalid_indices))}.")



                  email
0  john.doe@example.com
1         alice@example
2         invalid.email
3           charlie@com
4       bob@company.com
Invalid email addresses found in rows: 1, 2, 3.


Data Integrity Check: Ensure that the structured data maintains integrity (e.g., relationships between data elements).

In [30]:
import pandas as pd

def validate_order_items(order_items):
    """Return the positions of items whose 'quantity' is zero or negative.

    Args:
        order_items: Sequence of mappings, each with a 'quantity' entry.

    Returns:
        List of 0-based positions, in input order, of the offending items.
    """
    return [
        position
        for position, entry in enumerate(order_items)
        if entry['quantity'] <= 0
    ]

# Demo order lines; items 2 and 3 carry non-positive quantities
order_items_data = [
    dict(item_id=1, quantity=3),
    dict(item_id=2, quantity=0),
    dict(item_id=3, quantity=-2),
    dict(item_id=4, quantity=5),
]

# Load the order lines into a DataFrame
df = pd.DataFrame(order_items_data)

print(df)

# Hand the rows to the validator as a list of per-row dicts
invalid_row_indices = validate_order_items(df.to_dict(orient='records'))

if not invalid_row_indices:
    print("All order items are valid.")
else:
    print(f"Invalid order items found in rows: {', '.join(map(str, invalid_row_indices))}.")


   item_id  quantity
0        1         3
1        2         0
2        3        -2
3        4         5
Invalid order items found in rows: 1, 2.


Uniqueness Check: Ensure that elements in a list or dictionary are unique.

In [27]:
import pandas as pd

def validate_unique_ids(ids):
    """Raise ValueError when *ids* contains a repeated value.

    Args:
        ids: Sized collection of hashable identifiers.

    Raises:
        ValueError: If de-duplicating the collection changes its size.
    """
    total = len(ids)
    distinct = len(set(ids))
    if total == distinct:
        return
    raise ValueError("IDs must be unique.")

# Demo identifiers; 102 appears twice
ids_data = dict(id=[101, 102, 103, 102, 104])

# Load the identifiers into a DataFrame
df = pd.DataFrame(ids_data)

print(df)

# Check the 'id' column for duplicates and report the outcome
try:
    validate_unique_ids(df['id'].tolist())
except ValueError as e:
    print(f"Invalid IDs:\n{str(e)}")
else:
    print("All IDs are unique.")


    id
0  101
1  102
2  103
3  102
4  104
Invalid IDs:
IDs must be unique.


Membership Check: Verify that elements in the data structure are present in a predefined set.

In [29]:
import pandas as pd

def validate_category(category):
    """Raise ValueError unless *category* is one of the allowed names.

    Args:
        category: The value to look up.

    Raises:
        ValueError: If the value is not 'Electronics', 'Clothing' or 'Books'.
    """
    allowed = ('Electronics', 'Clothing', 'Books')
    if category in allowed:
        return
    raise ValueError("Invalid category.")

# Demo categories; the two 'Invalid' entries are not in the allowed set
categories_data = dict(
    category=['Electronics', 'Clothing', 'Invalid', 'Books', 'Invalid'],
)

# Load the categories into a DataFrame
df = pd.DataFrame(categories_data)

print(df)

# Record the index label of each row the validator rejects
invalid_row_indices = []
for index, category in df['category'].items():
    try:
        validate_category(category)
    except ValueError:
        invalid_row_indices.append(index)

if not invalid_row_indices:
    print("All categories are valid.")
else:
    print(f"Invalid categories found in rows: {', '.join(map(str, invalid_row_indices))}.")


      category
0  Electronics
1     Clothing
2      Invalid
3        Books
4      Invalid
Invalid categories found in rows: 2, 4.


Required Fields Check: Ensure that required fields are present, non-empty, and contain valid data.

In [36]:
import pandas as pd

def _is_missing_field(value):
    """Return True when *value* should count as an absent registration field."""
    if value is None:
        return True
    if isinstance(value, str):
        # Whitespace-only text carries no information, so treat it as empty.
        return not value.strip()
    # NaN is truthy, so a plain truthiness test would accept a missing value
    # that pandas represents as NaN; check for it explicitly.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    return not value


def validate_user_registration(user_data):
    """Raise ValueError if a required registration field is absent or empty.

    Args:
        user_data: Mapping for one user; must provide 'username', 'password'
            and 'email'.

    Raises:
        ValueError: Naming the first required field that is missing, None,
            NaN, blank, or otherwise falsy.
    """
    required_fields = ['username', 'password', 'email']
    for field in required_fields:
        if _is_missing_field(user_data.get(field)):
            raise ValueError(f"Missing required field: {field}.")

# Demo registrations; rows 1, 2 and 3 each lack one required value
user_data = dict(
    username=['user1', 'user2', None, 'user4'],
    password=['pass123', None, 'pass456', 'pass789'],
    email=['user1@example.com', 'user2@example.com', 'user3@example.com', None],
)

# Load the registrations into a DataFrame
df = pd.DataFrame(user_data)

print(df)

# Validate each row as a plain dict and remember the labels of the failures
invalid_row_indices = []
for index, record in zip(df.index, df.to_dict(orient='records')):
    try:
        validate_user_registration(record)
    except ValueError:
        invalid_row_indices.append(index)

if not invalid_row_indices:
    print("All user registration data is valid.")
else:
    print(f"Invalid user registration data found in rows: {', '.join(map(str, invalid_row_indices))}.")


  username password              email
0    user1  pass123  user1@example.com
1    user2     None  user2@example.com
2     None  pass456  user3@example.com
3    user4  pass789               None
Invalid user registration data found in rows: 1, 2, 3.


Consistency Check: Validate that data is consistent across related structures (e.g., dictionaries in a list).

In [46]:
import pandas as pd

def validate_order_data(item_id, price, quantity, total_price, row_index):
    """Raise ValueError when total_price does not equal price * quantity.

    Args:
        item_id: Identifier of the item; accepted but not used in the check.
        price: Unit price.
        quantity: Number of units.
        total_price: Recorded total to verify.
        row_index: Row label included in the error message.

    Raises:
        ValueError: If the recorded total disagrees with the computed one.
            Integer values must match exactly; when a float is involved, a
            difference within floating-point rounding is tolerated.
    """
    import math  # local import: only this validator needs it

    calculated_total_price = price * quantity
    if calculated_total_price == total_price:
        return
    # Exact comparison misfires on floats (0.1 * 3 != 0.3), so allow a tiny
    # tolerance, but only when a float is involved, so integers stay exact.
    involves_float = isinstance(calculated_total_price, float) or isinstance(total_price, float)
    if involves_float and math.isclose(calculated_total_price, total_price, rel_tol=1e-9, abs_tol=1e-9):
        return
    raise ValueError(f"Row {row_index}: Inconsistent total price in order data.")

# Demo orders; the totals in rows 1, 2 and 4 do not equal price * quantity
order_data = dict(
    item_id=[101, 102, 103, 104],
    price=[10, 15, 20, 25],
    quantity=[5, 3, 2, 4],
    total_price=[65, 50, 40, 90],
)

# Load the orders into a DataFrame
df = pd.DataFrame(order_data)

# Keep a 1-based row number in the frame so error output can cite it
df['row_index'] = df.index + 1

print(df)

# Check each order and remember the row numbers of the inconsistent ones
invalid_rows = []
for item_id, price, quantity, total_price, row_index in df.itertuples(index=False, name=None):
    try:
        validate_order_data(item_id, price, quantity, total_price, row_index)
    except ValueError:
        invalid_rows.append(row_index)

if not invalid_rows:
    print("All order data is valid.")
else:
    print(f"Inconsistent total price found in rows: {', '.join(map(str, invalid_rows))}.")


   item_id  price  quantity  total_price  row_index
0      101     10         5           65          1
1      102     15         3           50          2
2      103     20         2           40          3
3      104     25         4           90          4
Inconsistent total price found in rows: 1, 2, 4.


Validation Using Regular Expressions: Validate data against defined patterns using regular expressions.

In [49]:
import pandas as pd
import re

def validate_phone_number(phone_number):
    """Raise ValueError unless *phone_number* is formatted as xxx-xxx-xxxx.

    Args:
        phone_number: The value to check.

    Raises:
        ValueError: If the value is not a string, or is not exactly three
            ASCII digits, a dash, three digits, a dash and four digits.
    """
    pattern = r'^\d{3}-\d{3}-\d{4}$'
    # fullmatch() rejects a trailing newline that '$' alone would allow, and
    # re.ASCII stops \d from matching non-ASCII digits. The isinstance guard
    # keeps None/NaN cells from raising TypeError inside the regex engine.
    if not isinstance(phone_number, str) or not re.fullmatch(pattern, phone_number, flags=re.ASCII):
        raise ValueError("Invalid phone number format. Use xxx-xxx-xxxx.")

# Demo phone numbers, three of which break the xxx-xxx-xxxx format
phone_numbers = [
    '123-456-7890',   # well-formed
    '987-654-321',    # last group is one digit short
    'abc-def-ghij',   # letters instead of digits
    '555-555-5555',   # well-formed
    '1234-567-890',   # first group has an extra digit
]

# Load the numbers into a single-column DataFrame
df = pd.DataFrame({'phone_number': phone_numbers})

# Keep the row label in the frame so error output can cite it
df['row_index'] = df.index

print(df)

# Check each number and remember the labels of the malformed ones
invalid_rows = []
for phone_number, row_index in df.itertuples(index=False, name=None):
    try:
        validate_phone_number(phone_number)
    except ValueError:
        invalid_rows.append(row_index)

if not invalid_rows:
    print("All phone numbers are valid.")
else:
    print(f"Invalid phone number formats found in rows: {', '.join(map(str, invalid_rows))}.")


   phone_number  row_index
0  123-456-7890          0
1   987-654-321          1
2  abc-def-ghij          2
3  555-555-5555          3
4  1234-567-890          4
Invalid phone number formats found in rows: 1, 2, 4.


Custom Validation Functions: Implement custom validation functions tailored to specific data requirements.

Business Rule Validation: Validate data against predefined business rules or constraints.

Dependency Check: Ensure that the presence of one field in the data implies the presence or absence of other fields.

Context-Based Validation: Validate data based on the context or state of the application.

Cross-Field Validation: Validate data across multiple fields for consistency.

Validation Using External APIs: Validate data by calling external APIs or services.

Concurrency Control: Ensure data consistency in multi-threaded or multi-process environments.

Schema Validation: Use JSON Schema or other schema validation tools to validate structured data.

Strict Mode: Implement a strict mode for data validation, where certain constraints are enforced more rigorously.

Localization Validation: Validate data based on localization and regional standards (e.g., date formats, number formats).