In [0]:
# ================================================
# E-commerce Sales Analytics ETL Pipeline
# ================================================

from pyspark.sql.functions import (
    col, when, sum as spark_sum, count, avg,
    max as spark_max, min as spark_min, round,
    current_timestamp, to_date, year, month
)
import logging

# Module-level logger for this pipeline; root logging is configured at INFO.
# (basicConfig does nothing if the root logger already has handlers.)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Announce the start of the run in the log and on stdout.
logger.info("Starting E-commerce Sales Analytics ETL")
print("\n=== E-COMMERCE SALES ANALYTICS ETL ===")

# EXTRACT
print("\nSTAGE 1: EXTRACT")

# Hard-coded sample orders.
# Tuple layout: (order_id, customer_id, order_date, amount, category, status)
orders_data = [
    (101, 1001, "2024-01-15", 250, "Electronics", "Completed"),
    (102, 1002, "2024-01-16", 500, "Electronics", "Completed"),
    (103, 1001, "2024-01-17", 150, "Clothing", "Pending"),
    (104, 1003, "2024-01-18", 800, "Home", "Completed"),
    (105, 1002, "2024-01-19", 300, "Clothing", "Shipped"),
]

# NOTE(review): `spark` is not defined in this file view; presumably the
# notebook/runtime provides the SparkSession -- confirm.
orders_df = spark.createDataFrame(
    orders_data,
    ["order_id", "customer_id", "order_date", "amount", "category", "status"]
)

# count() is a Spark action: run it once and reuse the value for both the
# log record and the console line instead of triggering two jobs.
extracted_count = orders_df.count()
logger.info("Extracted %s orders", extracted_count)
print(f"Extracted {extracted_count} orders")

# TRANSFORM
print("\nSTAGE 2: TRANSFORM")

# Parse the order date, derive year/month from it, and stamp each row with
# the processing time. The order of the withColumn calls matters: "year" and
# "month" read the "order_date" column after it has been converted by to_date.
sales_df = (
    orders_df
    .withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd"))
    .withColumn("year", year(col("order_date")))
    .withColumn("month", month(col("order_date")))
    .withColumn("processing_date", current_timestamp())
)

# count() is a Spark action: evaluate it once and reuse the value for both
# the log record and the console line instead of triggering two jobs.
transformed_count = sales_df.count()
logger.info("Transformed %s sales records", transformed_count)
print(f"Transformed {transformed_count} sales records")

# LOAD
print("\nSTAGE 3: LOAD")

# Per-category report: order count, summed amount, and mean amount.
sales_by_category = sales_df.groupBy("category")
category_sales = sales_by_category.agg(
    count("*").alias("num_orders"),
    spark_sum("amount").alias("total_sales"),
    avg("amount").alias("avg_order_value"),
)

logger.info("Category sales report created")
print("\nSales by Category:")
category_sales.show()

# Per-customer report: number of orders and summed amount for each customer_id.
sales_by_customer = sales_df.groupBy("customer_id")
customer_metrics = sales_by_customer.agg(
    count("*").alias("num_purchases"),
    spark_sum("amount").alias("lifetime_value"),
)

# One output row per customer_id, so the row count is the number of customers.
unique_customers = customer_metrics.count()
logger.info(f"Customer metrics: {unique_customers} unique customers")
print("\nCustomer Lifetime Value:")
customer_metrics.show()

# Final status banner on stdout, then the closing log record.
print("\n=== PIPELINE STATUS: SUCCESS ===")
logger.info("E-commerce ETL Pipeline completed successfully")