# In [0]:
# Healthcare Claims Processing ETL Pipeline
from pyspark.sql.functions import (
    col, when, sum as spark_sum, count, avg,
    current_timestamp, to_date
)
import logging

# Module-level logger used by every pipeline stage below.
logger = logging.getLogger(__name__)

# Configure the root logger at INFO so progress messages are not dropped by
# the default WARNING threshold. NOTE: basicConfig does nothing if the root
# logger already has handlers attached (e.g. by a hosting environment).
logging.basicConfig(level=logging.INFO)

# Announce the run both in the log and on stdout.
logger.info("Starting Healthcare Claims ETL Pipeline")
print("\n=== HEALTHCARE CLAIMS PROCESSING ETL ===")

# EXTRACT
# Build the claims DataFrame from a small in-memory sample. Column types are
# inferred by Spark from the Python values, so claim_date is still a string
# at this point and claim_amount is an integer type.
# NOTE(review): `spark` is not defined in this file; it is assumed to be the
# SparkSession supplied by the notebook runtime -- confirm.
print("\nSTAGE 1: EXTRACT")

claims_columns = [
    "claim_id",
    "patient_id",
    "claim_date",
    "claim_amount",
    "status",
    "processing_status",
]

claims_data = [
    ("C001", "P001", "2024-01-15", 5000, "Approved", "Processed"),
    ("C002", "P002", "2024-01-16", 12000, "Approved", "Processed"),
    ("C003", "P001", "2024-01-17", 8000, "Pending", "Under Review"),
    ("C004", "P003", "2024-01-18", 3500, "Denied", "Rejected"),
    ("C005", "P002", "2024-01-19", 15000, "Approved", "Processed"),
]

claims_df = spark.createDataFrame(claims_data, claims_columns)

# count() is a Spark action: every call launches a job. Evaluate it once and
# reuse the number for both the log record and the console message.
extracted_count = claims_df.count()
logger.info("Extracted %d claims", extracted_count)
print(f"Extracted {extracted_count} insurance claims")

# TRANSFORM
# Derive typed and categorical columns from the raw claims:
#   claim_date      -> parsed from "yyyy-MM-dd" text into a date
#   processing_date -> timestamp taken when the query is evaluated
#   approval_score  -> numeric score derived from status
#   claim_category  -> value bucket derived from claim_amount
print("\nSTAGE 2: TRANSFORM")

# Approved -> 100, Pending -> 50, every other status (e.g. Denied) -> 0.
approval_score_expr = (
    when(col("status") == "Approved", 100)
    .when(col("status") == "Pending", 50)
    .otherwise(0)
)

# Thresholds are strict: exactly 10000 is "Medium Value" and exactly 5000 is
# "Low Value". Rows where neither comparison is true (including a null
# claim_amount) fall through to "Low Value".
claim_category_expr = (
    when(col("claim_amount") > 10000, "High Value")
    .when(col("claim_amount") > 5000, "Medium Value")
    .otherwise("Low Value")
)

claims_processed = (
    claims_df
    .withColumn("claim_date", to_date(col("claim_date"), "yyyy-MM-dd"))
    .withColumn("processing_date", current_timestamp())
    .withColumn("approval_score", approval_score_expr)
    .withColumn("claim_category", claim_category_expr)
)

# count() is a Spark action: every call launches a job. Evaluate it once and
# reuse the number for both the log record and the console message.
transformed_count = claims_processed.count()
logger.info("Transformed %d claims", transformed_count)
print(f"Transformed {transformed_count} claims")

# LOAD
# This stage only aggregates and displays summaries; nothing in this block
# writes the results to storage.
print("\nSTAGE 3: LOAD")

# Aggregate expressions, named once so both summaries use the same definitions.
claim_count_agg = count("*").alias("num_claims")
claim_total_agg = spark_sum("claim_amount").alias("total_amount")
claim_mean_agg = avg("claim_amount").alias("avg_claim")

# Per-status rollup: number of claims, total amount and average amount.
claims_by_status = claims_processed.groupBy("status")
status_summary = claims_by_status.agg(
    claim_count_agg,
    claim_total_agg,
    claim_mean_agg,
)

logger.info("Claims summary created")
print("\nClaims by Status:")
status_summary.show()

# Per-category rollup: number of claims and total amount (no average here).
claims_by_category = claims_processed.groupBy("claim_category")
category_summary = claims_by_category.agg(claim_count_agg, claim_total_agg)

print("\nClaims by Category:")
category_summary.show()

# Reached only if every step above completed without raising.
print("\n=== PIPELINE STATUS: SUCCESS ===")
logger.info("Healthcare Claims ETL Pipeline completed successfully")