# In [0]:
# ==========================================================
# Customer KYC Data Processing ETL Pipeline
# Banking Know Your Customer (KYC) Compliance
# ==========================================================

import logging

from pyspark.sql.functions import (
    col, when, lit, length, upper, md5, concat,
    current_timestamp, to_date, year, coalesce, count, avg,
    current_date, floor, months_between
)

# Configure logging for the pipeline and announce the run.
# basicConfig() is a no-op when the root logger already has handlers (as is
# typical inside notebook runtimes), which would leave the level untouched and
# silently drop the INFO records below. Setting the level on this module's
# logger guarantees they are emitted without disturbing existing handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("Starting KYC Data Processing ETL Pipeline")
print("\n=== KYC DATA PROCESSING ETL ===")

# STAGE 1: EXTRACT
# Builds the input DataFrame `kyc_df` from an in-memory list of records.
print("\nSTAGE 1: EXTRACT")

# One tuple per customer, in the column order given to createDataFrame below.
# Dates of birth are ISO "yyyy-MM-dd" strings at this point.
kyc_data = [
    (1, "John Doe", "john@bank.com", "1980-05-15", "DL123456", "Active", "Verified"),
    (2, "Jane Smith", "jane@bank.com", "1985-08-22", "PAN789012", "Active", "Verified"),
    (3, "Bob Wilson", "bob@bank.com", "1990-12-10", "AADHAAR345", "Pending", "Submitted"),
    (4, "Alice Johnson", "alice@bank.com", "1975-03-18", "PAN456789", "Active", "Verified"),
    (5, "Charlie Brown", "charlie@bank.com", "1992-07-25", "DL987654", "Rejected", "Failed"),
]

# `spark` is not defined in this script; it is expected to be supplied by the
# surrounding runtime (e.g. a notebook session).
kyc_df = spark.createDataFrame(
    kyc_data,
    ["customer_id", "full_name", "email", "dob", "id_number", "kyc_status", "verification_status"],
)

# count() is a Spark action: evaluate it once and reuse the result for both
# the log record and the console message instead of running two jobs.
extracted_count = kyc_df.count()
logger.info("Extracted %d KYC records", extracted_count)
print(f"Extracted {extracted_count} KYC records")

# STAGE 2: TRANSFORM
# Derives, for every record in `kyc_df`: a parsed date of birth, the age in
# completed years, three 0/1 validity flags, a weighted compliance score
# (name 25 + email 25 + id 50) and a compliance label. The result is bound to
# `kyc_processed`.
print("\nSTAGE 2: TRANSFORM")

kyc_processed = (
    kyc_df
    # Parse the ISO date string into a date column (replaces the string column).
    .withColumn("dob", to_date(col("dob"), "yyyy-MM-dd"))
    # Age in completed years. Subtracting calendar years alone overstates the
    # age of anyone whose birthday has not yet occurred this year, so count
    # the months elapsed since the date of birth, divide by 12 and floor.
    # The cast keeps the integer column type the year difference produced.
    .withColumn(
        "age",
        floor(months_between(current_date(), col("dob")) / 12).cast("int"),
    )
    # Validity flags: a null input makes the condition null, which falls
    # through to otherwise(0).
    .withColumn("name_valid", when(length(col("full_name")) >= 3, 1).otherwise(0))
    .withColumn("email_valid", when(col("email").contains("@"), 1).otherwise(0))
    .withColumn("id_valid", when(length(col("id_number")) >= 6, 1).otherwise(0))
    # Weighted score in {0, 25, 50, 75, 100}; the id check carries half the weight.
    .withColumn(
        "kyc_compliance_score",
        col("name_valid") * 25 + col("email_valid") * 25 + col("id_valid") * 50,
    )
    # 100 -> all checks passed; 75 -> id valid plus one of name/email.
    .withColumn(
        "compliance_status",
        when(col("kyc_compliance_score") == 100, "Compliant")
        .when(col("kyc_compliance_score") >= 75, "Partial")
        .otherwise("Non-Compliant"),
    )
    .withColumn("processing_date", current_timestamp())
)

# count() triggers a Spark job; run it once and reuse the value for both the
# log record and the console message.
processed_count = kyc_processed.count()
logger.info("KYC processing completed for %d customers", processed_count)
print(f"KYC processing completed: {processed_count} customers")

# STAGE 3: LOAD
# Prints two views of `kyc_processed`: an aggregate per kyc_status and a
# per-customer compliance listing. Nothing is written to external storage here.
print("\nSTAGE 3: LOAD")

# Aggregate expressions for the per-status summary.
customers_per_status = count("*").alias("customer_count")
mean_score_per_status = avg("kyc_compliance_score").alias("avg_compliance_score")

kyc_summary = (
    kyc_processed
    .groupBy("kyc_status")
    .agg(customers_per_status, mean_score_per_status)
)

logger.info("KYC summary created")
print("\nKYC Summary by Status:")
kyc_summary.show()

# Compliance report: one row per customer with the derived compliance label.
report_columns = ("customer_id", "full_name", "kyc_status", "compliance_status")
compliance_report = kyc_processed.select(*report_columns)

report_rows = compliance_report.count()
print(f"\nCompliance Status Report: {report_rows} customers")
compliance_report.show()

print("\n=== PIPELINE STATUS: SUCCESS ===")
logger.info("KYC ETL Pipeline completed successfully")