# Data Validation

## 1️⃣ Introduction
This script performs data validation checks to ensure data quality for both the **Telco Customer Churn Dataset** and the **Synthetic Customer Churn Dataset**. It covers:
- Detection of missing or inconsistent data
- Validation of data types, formats, and ranges
- Identification of duplicate records and anomalies
- Generation of a comprehensive data quality report

## 2️⃣ Python Code for Data Validation
```python
import logging
import os
from datetime import datetime

import pandas as pd

# Route root-logger records (INFO and above) to a timestamped log file
logging.basicConfig(
    filename="data_validation.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

def log_message(message, level="info"):
    """Log a message at the requested severity and echo it to stdout.

    Args:
        message: Text to record; it is handed to ``logging`` and ``print``.
        level: Case-insensitive severity name: "debug", "info", "warning",
            "error" or "critical". Unrecognised names fall back to "info"
            so the message still reaches the log.
    """
    severities = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }
    # Previously only "info" and "error" were logged; any other level was
    # printed but silently missing from the log file.
    severity = severities.get(str(level).lower(), logging.INFO)
    logging.log(severity, message)
    print(message)

# Locations of the two raw churn datasets
TELCO_CSV_PATH = "data_storage/raw/kaggle/telco_churn.csv"
SYNTHETIC_CSV_PATH = "data_storage/raw/synthetic/synthetic_churn.csv"

# Read each CSV into its own DataFrame
telco_data = pd.read_csv(TELCO_CSV_PATH)
synthetic_data = pd.read_csv(SYNTHETIC_CSV_PATH)

def validate_data(df, dataset_name):
    """Validate a dataset for missing values, unexpected value types and duplicates.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the log messages.

    Returns:
        A dict with three entries:
        "missing_values" maps each column that has nulls to its null count;
        "incorrect_types" maps each expected column holding at least one
        non-null value of the wrong Python type to that column's dtype name;
        "duplicates" is the number of rows flagged by ``df.duplicated()``.
    """
    log_message(f"Validating dataset: {dataset_name}")
    report = {}

    # Missing values check
    missing_values = df.isnull().sum()
    report["missing_values"] = missing_values[missing_values > 0].to_dict()

    # Data type validation (only columns present in this dataset are checked)
    expected_types = {
        "customerID": str,
        "Churn": str,
        "tenure": int,
        "MonthlyCharges": float,
        "TotalCharges": str,  # Needs conversion
        "Age": int,
        "Region": str,
        "SupportTickets": int,
        "AvgSessionTime": float,
    }
    incorrect_types = {}
    for col, expected_type in expected_types.items():
        if col not in df.columns:
            continue
        # Nulls are already reported under "missing_values"; leaving them in
        # would flag every text column with a NaN as a type error as well.
        present = df[col].dropna()
        if not present.map(lambda x: isinstance(x, expected_type)).all():
            # Store the dtype name so the report holds plain Python values.
            incorrect_types[col] = str(df[col].dtype)
    report["incorrect_types"] = incorrect_types

    # Duplicates check (cast the NumPy scalar to a plain int for the report)
    duplicate_count = int(df.duplicated().sum())
    report["duplicates"] = duplicate_count

    # Log issues
    if missing_values.sum() > 0 or duplicate_count > 0 or incorrect_types:
        log_message(f"Issues found in {dataset_name}: {report}", level="error")
    else:
        log_message(f"No major issues found in {dataset_name}")

    return report

# Run validation on both datasets
telco_report = validate_data(telco_data, "Telco Customer Churn Dataset")
synthetic_report = validate_data(synthetic_data, "Synthetic Customer Churn Dataset")

# Save report
report_df = pd.DataFrame({"Telco": telco_report, "Synthetic": synthetic_report})
report_path = "data_storage/processed/data_quality_report.csv"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(report_path), exist_ok=True)
report_df.to_csv(report_path)
log_message("Data validation completed. Report saved.")
```
