Automated Data Validation Script (Pandas)
------------------------------------------------------
- Reads dataset
- Validates schema, datatypes, ranges, allowed values
- Generates visuals
- Creates a text report with issues and recommendations

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

In [0]:
# Read the churn CSV from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='github_churn_10000.csv')
# Bare expression: as the last statement of a notebook cell it renders a preview.
data

In [0]:
# (lower, upper) bounds per numeric column; None means that side is unbounded.
# Every column except Age shares the same "non-negative, no upper limit" range.
numeric_ranges = {
    "Age": (0, 120),
    **dict.fromkeys(("Tenure", "MonthlyCharges", "TotalCharges"), (0, None)),
}

## Validation Checks

In [0]:
# Accumulates one human-readable message per validation check, in run order.
validation_results: list = []

#### Missing Columns check 

In [0]:
# Columns the dataset must contain; checked against the loaded frame's header.
expected_columns = [
    "CustomerID",
    "Age",
    "Gender",
    "Tenure",
    "MonthlyCharges",
    "TotalCharges",
    "ContractType",
    "PaymentMethod",
]

In [0]:
# Schema check: collect every expected column that the loaded frame lacks,
# preserving the order in which the columns are listed in expected_columns.
missing_columns = [name for name in expected_columns if name not in data.columns]
validation_results.append(
    f" Missing columns: {missing_columns}"
    if missing_columns
    else "All expected columns are present."
)

print(missing_columns)

#### Null values check 

In [0]:
# Count missing entries per column, then keep only the columns that have any.
null_counts = data.isna().sum()
null_issues = null_counts.loc[null_counts.gt(0)]
if null_issues.empty:
    validation_results.append(" No null values found.")
else:
    # The Series is interpolated as-is, so the message lists column -> count.
    validation_results.append(f" Null values found:\n{null_issues}")

print(null_issues)

#### Duplicates check 

In [0]:
# Count rows flagged by DataFrame.duplicated() and record a single summary line.
dup_count = data.duplicated().sum()
validation_results.append(
    f" Found {dup_count} duplicate rows." if dup_count > 0 else " No duplicate rows."
)

print(dup_count)

#### Data type checks (basic)

In [0]:
# Flag every range-checked column whose dtype is not numeric.
# Columns absent from the frame are skipped: indexing them would raise
# KeyError and abort the run, and the column-presence check already reports them.
for col in numeric_ranges:
    if col not in data.columns:
        continue
    if not pd.api.types.is_numeric_dtype(data[col]):
        validation_results.append(f"{col} should be numeric but found {data[col].dtype}.")

#### Range checks

In [0]:
# Check each configured column against its (min_val, max_val) bounds.
# A bound of None means that side is unbounded; values equal to a bound pass.
for col, (min_val, max_val) in numeric_ranges.items():
    # Comparing a missing column raises KeyError and comparing a non-numeric
    # one (e.g. strings) raises TypeError; record the skip instead of crashing
    # so the remaining checks still run.
    if col not in data.columns or not pd.api.types.is_numeric_dtype(data[col]):
        validation_results.append(f"{col} range check skipped: column is missing or not numeric.")
        continue

    # Start from "nothing invalid" and OR in each bound that is configured.
    invalid_mask = pd.Series(False, index=data.index)
    if min_val is not None:
        invalid_mask |= (data[col] < min_val)
    if max_val is not None:
        invalid_mask |= (data[col] > max_val)

    if invalid_mask.any():
        validation_results.append(f"{col} has values outside range {min_val}-{max_val}: {data.loc[invalid_mask, col].tolist()}")
    else:
        validation_results.append(f" {col} values are within range {min_val}-{max_val}.")

#### Allowed categorical values

In [0]:
# Permitted labels for each categorical column; anything else is reported.
allowed_values = dict(
    Gender=["M", "F"],
    ContractType=["Month-to-Month", "One Year", "Two Year"],
    PaymentMethod=["Electronic Check", "Mailed Check", "Bank Transfer", "Credit Card"],
)

In [0]:
# Compare the distinct values of each categorical column with its allowed set.
for col, allowed in allowed_values.items():
    # A missing column would raise KeyError; record the skip and keep going.
    if col not in data.columns:
        validation_results.append(f" {col} allowed-values check skipped: column is missing.")
        continue

    # Nulls are dropped here because the null check already counts them;
    # otherwise NaN would be reported a second time as an "unexpected" category.
    invalid_cats = set(data[col].dropna().unique()) - set(allowed)
    if invalid_cats:
        # Sort (by string form, so mixed types cannot break the sort) to keep
        # the report text stable between runs; set order is not guaranteed.
        validation_results.append(f" {col} contains unexpected values: {sorted(invalid_cats, key=str)}")
    else:
        validation_results.append(f" {col} values match allowed categories.")

## Visualizations

#### Numeric distributions

In [0]:

# Draw one 5-bin histogram with a KDE overlay for every numeric column.
num_cols = data.select_dtypes(include=[np.number]).columns
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[col], kde=True, bins=5, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()
    # Release the figure once shown so figures do not pile up across the loop.
    plt.close(fig)

#### Categorical counts

In [0]:
# Draw one bar chart of value counts for every non-numeric column.
cat_cols = data.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=data[col], ax=ax)
    ax.set_title(f"Value Counts of {col}")
    plt.show()
    # Release the figure once shown so figures do not pile up across the loop.
    plt.close(fig)


In [0]:
# Bare expression: as the last statement of a notebook cell it displays the
# list of messages collected by the validation checks.
validation_results