# Data Quality Validation - Products Table

**Table:** `stg_products`  
**Project:** Maven Fuzzy Factory E-Commerce Analytics  
**Created:** November 20, 2025  
**Purpose:** Validate data quality for the products staging table

---

## Validation Scope

**Primary Key:** product_id  
**Foreign Keys:** None (master data table)  
**Critical Fields:** created_at, product_name  
**Expected Row Count Range:** 1 - 100

**Validation Checks:**
- Row count within expected range
- Primary key uniqueness (0% duplicates for master data)
- Null checks on critical columns
- Data type validation
- Positive integer validation for IDs
- No future dates
- Valid product name format
- Product name uniqueness

---

## 1. Configuration & Setup

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, StructType, StructField, StringType, TimestampType
from datetime import datetime
import uuid

# The wildcard import above rebinds `sum` to the PySpark column function;
# dropping that binding lets later cells resolve `sum` to the Python built-in.
del sum

# Identity of this validation run (stamped on every persisted record)
RUN_TIMESTAMP = datetime.now()
RUN_ID = str(uuid.uuid4())

# Table under validation and its primary key
SOURCE_TABLE = "stg_products"
PK_COLUMN = "product_id"

# Destination tables for detailed results and the per-run summary
QUALITY_LOG_TABLE = "data_quality_log"
QUALITY_SUMMARY_TABLE = "data_quality_summary"

# Quality thresholds
MIN_ROW_COUNT = 1
MAX_ROW_COUNT = 100
MAX_DUPLICATE_PCT = 0.0  # master data: no duplicate keys tolerated

# Echo the run context, one item per line
print("\n".join((
    f"Validation Run ID: {RUN_ID}",
    f"Run Timestamp: {RUN_TIMESTAMP}",
    f"Source Table: {SOURCE_TABLE}",
)))

## 2. Load Source Data

In [None]:
# Load the staging table that every later cell validates
df = spark.read.table(SOURCE_TABLE)

print(f"Total Rows: {df.count():,}")
print(f"Total Columns: {len(df.columns)}")
print("\nSchema:")
df.printSchema()

# Display the products (up to 100 rows), ordered by the configured primary
# key instead of a hard-coded column name so this cell follows PK_COLUMN.
print("\nAll Products:")
df.orderBy(PK_COLUMN).show(100, truncate=False)

## 3. Basic Profiling

In [None]:
# Basic statistics (total_rows, distinct_products and distinct_names are
# reused by the validation checks further down)
total_rows = df.count()
distinct_products = df.select(PK_COLUMN).distinct().count()
distinct_names = df.select("product_name").distinct().count()

# Date range -- `min`/`max` here are the PySpark column functions brought in
# by the wildcard import, not the Python built-ins
date_stats = df.select(
    min(col("created_at")).alias("min_date"),
    max(col("created_at")).alias("max_date")
).collect()[0]

# Product name analysis
name_stats = df.select(
    length(col("product_name")).alias("name_length")
).agg(
    min(col("name_length")).alias("min_name_length"),
    max(col("name_length")).alias("max_name_length"),
    avg(col("name_length")).alias("avg_name_length")
).collect()[0]

# Duplicate figures are computed once instead of twice inside the f-string
profile_duplicates = total_rows - distinct_products
profile_duplicate_pct = (profile_duplicates / total_rows * 100) if total_rows > 0 else 0

# The aggregates are NULL (None) when the table is empty or every
# product_name is NULL. Formatting None with ':.1f' raises TypeError, which
# would stop the notebook before any validation result is recorded, so fall
# back to a placeholder.
avg_name_length = name_stats["avg_name_length"]
avg_name_length_text = f"{avg_name_length:.1f}" if avg_name_length is not None else "N/A"

print(f"Total Rows: {total_rows:,}")
print(f"Distinct Products: {distinct_products:,}")
print(f"Distinct Product Names: {distinct_names:,}")
print(f"Duplicate Products: {profile_duplicates:,} ({profile_duplicate_pct:.2f}%)")
print(f"Date Range: {date_stats['min_date']} to {date_stats['max_date']}")
print("\nProduct Name Statistics:")
print(f"  Min Name Length: {name_stats['min_name_length']} characters")
print(f"  Max Name Length: {name_stats['max_name_length']} characters")
print(f"  Avg Name Length: {avg_name_length_text} characters")

## 4. Validation Checks

In [None]:
# One dict per executed check is collected here; later cells score the list
# and persist it.
validation_results = []

def add_validation_result(check_name, check_type, column_name, passed, invalid_count, threshold, message):
    """Record the outcome of a single check and print a one-line status.

    Appends a dict to the module-level ``validation_results`` list, stamped
    with RUN_ID, RUN_TIMESTAMP and SOURCE_TABLE. The truthiness of ``passed``
    is stored as the string "True" or "False". Returns None.
    """
    if passed:
        passed_flag, status = "True", "✓ PASSED"
    else:
        passed_flag, status = "False", "✗ FAILED"

    record = {
        "run_id": RUN_ID,
        "run_timestamp": RUN_TIMESTAMP,
        "table_name": SOURCE_TABLE,
        "check_name": check_name,
        "check_type": check_type,
        "column_name": column_name,
        "passed": passed_flag,
        "invalid_count": invalid_count,
        "threshold": threshold,
        "message": message,
    }
    validation_results.append(record)

    print(f"{status} - {check_name}: {message}")

In [None]:
# Check 1: the table must hold between MIN_ROW_COUNT and MAX_ROW_COUNT rows
# (both bounds inclusive)
row_count_valid = total_rows >= MIN_ROW_COUNT and total_rows <= MAX_ROW_COUNT

# On failure the whole row count is reported as the invalid count
if row_count_valid:
    row_count_position, row_count_invalid = "within", 0
else:
    row_count_position, row_count_invalid = "outside", total_rows

add_validation_result(
    check_name="Row Count Range",
    check_type="completeness",
    column_name="*",
    passed=row_count_valid,
    invalid_count=row_count_invalid,
    threshold=f"{MIN_ROW_COUNT}-{MAX_ROW_COUNT}",
    message=f"Row count {total_rows:,} is {row_count_position} expected range",
)

In [None]:
# Check 2: primary key uniqueness -- rows in excess of the distinct key count
# are reported as duplicates
duplicate_count = total_rows - distinct_products

# Guard the percentage against an empty table
if total_rows > 0:
    duplicate_pct = duplicate_count / total_rows * 100
else:
    duplicate_pct = 0

pk_valid = duplicate_pct <= MAX_DUPLICATE_PCT

add_validation_result(
    check_name="Primary Key Uniqueness",
    check_type="uniqueness",
    column_name=PK_COLUMN,
    passed=pk_valid,
    invalid_count=duplicate_count,
    threshold=f"<={MAX_DUPLICATE_PCT}%",
    message=f"Found {duplicate_count:,} duplicates ({duplicate_pct:.2f}%)",
)

In [None]:
# Check 3: none of the critical columns may contain NULLs
critical_columns = [PK_COLUMN, "created_at", "product_name"]

# One result is recorded per column so each shows up separately in the log
for col_name in critical_columns:
    null_count = df.where(col(col_name).isNull()).count()
    null_valid = null_count == 0

    add_validation_result(
        check_name=f"Null Check - {col_name}",
        check_type="completeness",
        column_name=col_name,
        passed=null_valid,
        invalid_count=null_count,
        threshold="0",
        message=f"Found {null_count:,} null values",
    )

In [None]:
# Check 4: every primary key value must be strictly positive.
# Despite its name, `negative_ids` counts all non-positive values (zero too).
non_positive_rows = df.where(col(PK_COLUMN) <= 0)
negative_ids = non_positive_rows.count()
id_valid = negative_ids == 0

add_validation_result(
    check_name="Positive Integer - product_id",
    check_type="validity",
    column_name=PK_COLUMN,
    passed=id_valid,
    invalid_count=negative_ids,
    threshold=">0",
    message=f"Found {negative_ids:,} non-positive IDs",
)

In [None]:
# Check 5: No future dates
# The reference time is deliberately NOT named `current_timestamp`: that name
# is the PySpark function pulled in by the wildcard import, and rebinding it
# to a datetime would break any later `current_timestamp()` call in this
# session.
validation_now = datetime.now()
future_dates = df.filter(col("created_at") > lit(validation_now)).count()
date_valid = future_dates == 0

add_validation_result(
    check_name="No Future Dates",
    check_type="validity",
    column_name="created_at",
    passed=date_valid,
    invalid_count=future_dates,
    threshold="<= current_date",
    message=f"Found {future_dates:,} future dates"
)

In [None]:
# Check 6: a product name is invalid when it is NULL, blank after trimming,
# shorter than 2 characters or longer than 100 characters
name_col = col("product_name")
name_len = length(name_col)

invalid_names = df.where(
    name_col.isNull()
    | (trim(name_col) == "")
    | (name_len < 2)
    | (name_len > 100)
).count()
name_valid = invalid_names == 0

add_validation_result(
    check_name="Valid Product Name",
    check_type="validity",
    column_name="product_name",
    passed=name_valid,
    invalid_count=invalid_names,
    threshold="2-100 characters",
    message=f"Found {invalid_names:,} invalid product names",
)

In [None]:
# Check 7: product names must be unique (business rule) -- rows in excess of
# the distinct name count are reported as duplicates
duplicate_names = total_rows - distinct_names
name_unique = duplicate_names == 0

add_validation_result(
    check_name="Product Name Uniqueness",
    check_type="uniqueness",
    column_name="product_name",
    passed=name_unique,
    invalid_count=duplicate_names,
    threshold="0 duplicates",
    message=f"Found {duplicate_names:,} duplicate product names",
)

## 5. Calculate Quality Score

In [None]:
# Overall quality score = percentage of recorded checks that passed
total_checks = len(validation_results)
passed_checks = len([r for r in validation_results if r["passed"] == "True"])

# With no checks recorded the score is 0 (and the status therefore FAILED)
if total_checks > 0:
    quality_score = passed_checks / total_checks * 100
else:
    quality_score = 0

# Only a perfect score counts as a pass
overall_status = "FAILED" if quality_score != 100 else "PASSED"

banner = "=" * 60
print("\n" + banner)
print(f"QUALITY SCORE: {quality_score:.1f}%")
print(f"CHECKS PASSED: {passed_checks}/{total_checks}")
print(f"OVERALL STATUS: {overall_status}")
print(banner)

## 6. Persist Results to Quality Log

In [None]:
# Explicit schema for the log records; field names match the keys written by
# add_validation_result. Only column_name, threshold and message are nullable.
log_schema = (
    StructType()
    .add("run_id", StringType(), False)
    .add("run_timestamp", TimestampType(), False)
    .add("table_name", StringType(), False)
    .add("check_name", StringType(), False)
    .add("check_type", StringType(), False)
    .add("column_name", StringType(), True)
    .add("passed", StringType(), False)
    .add("invalid_count", IntegerType(), False)
    .add("threshold", StringType(), True)
    .add("message", StringType(), True)
)

validation_log_df = spark.createDataFrame(validation_results, schema=log_schema)

# Append so that results from earlier runs stay in the log table
log_writer = validation_log_df.write.mode("append")
log_writer.saveAsTable(QUALITY_LOG_TABLE)

print(f"✓ Validation results written to {QUALITY_LOG_TABLE}")
print(f"  Records written: {len(validation_results)}")

## 7. Persist Summary to Quality Summary Table

In [None]:
# Total NULLs found across the per-column "Null Check" results
null_violations = sum(
    r["invalid_count"]
    for r in validation_results
    if r["check_type"] == "completeness" and "Null Check" in r["check_name"]
)

# One summary row per run; the score is stored as text with one decimal place
summary_record = {
    "run_id": RUN_ID,
    "run_timestamp": RUN_TIMESTAMP,
    "table_name": SOURCE_TABLE,
    "row_count": total_rows,
    "pk_duplicate_count": duplicate_count,
    "null_violations": null_violations,
    "validation_checks_total": total_checks,
    "validation_checks_passed": passed_checks,
    "quality_score": f"{quality_score:.1f}",
    "overall_status": overall_status,
}
summary_data = [summary_record]

# Every summary column is non-nullable, so the schema is built from
# (name, type) pairs with nullable fixed to False
summary_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("run_id", StringType()),
        ("run_timestamp", TimestampType()),
        ("table_name", StringType()),
        ("row_count", IntegerType()),
        ("pk_duplicate_count", IntegerType()),
        ("null_violations", IntegerType()),
        ("validation_checks_total", IntegerType()),
        ("validation_checks_passed", IntegerType()),
        ("quality_score", StringType()),
        ("overall_status", StringType()),
    )
])

summary_df = spark.createDataFrame(summary_data, schema=summary_schema)

# Append so that summaries from earlier runs stay in the table
summary_writer = summary_df.write.mode("append")
summary_writer.saveAsTable(QUALITY_SUMMARY_TABLE)

print(f"✓ Summary written to {QUALITY_SUMMARY_TABLE}")
print("\nValidation Complete!")

## 8. Verification - Query Persisted Results

In [None]:
# Read back what this run just wrote, filtering both tables on RUN_ID

# Detailed log records, one row per check
log_query = f"""
    SELECT check_name, check_type, column_name, passed, invalid_count, message
    FROM {QUALITY_LOG_TABLE}
    WHERE run_id = '{RUN_ID}'
    ORDER BY check_name
"""
print("Validation Log Records:")
spark.sql(log_query).show(truncate=False)

# Summary row for the run
summary_query = f"""
    SELECT table_name, row_count, pk_duplicate_count, null_violations,
           validation_checks_passed, validation_checks_total, 
           quality_score, overall_status
    FROM {QUALITY_SUMMARY_TABLE}
    WHERE run_id = '{RUN_ID}'
"""
print("\nQuality Summary:")
spark.sql(summary_query).show(truncate=False)