In [0]:

from pyspark.sql import SparkSession

# Build the Spark session for this notebook (getOrCreate hands back an
# existing session when one is already available).
spark = (
    SparkSession.builder
    .appName("ECom Transactions")
    .getOrCreate()
)

# Bare expression so the notebook renders the session object as cell output.
spark

<pyspark.sql.connect.session.SparkSession at 0xfff9399cc2d0>

In [0]:
# E-Commerce Transactions + Returns + Inventory (PySpark + Delta)
# 1. Ingest all 3 CSVs as Delta tables.

# Every CSV is read the same way: first row is the header, column types are inferred.
csv_options = {"header": True, "inferSchema": True}

df_orders = spark.read.options(**csv_options).csv("/Volumes/workspace/ecommerce/csv_data/orders.csv")
df_customers = spark.read.options(**csv_options).csv("/Volumes/workspace/ecommerce/csv_data/customers.csv")
df_products = spark.read.options(**csv_options).csv("/Volumes/workspace/ecommerce/csv_data/products.csv")

# The tables live in the 'ecommerce' database; create it before the first write.
spark.sql("CREATE DATABASE IF NOT EXISTS ecommerce")

# Persist each DataFrame as a Delta table, replacing whatever the table held before.
for frame, table_name in (
    (df_orders, "ecommerce.orders"),
    (df_customers, "ecommerce.customers"),
    (df_products, "ecommerce.products"),
):
    frame.write.saveAsTable(table_name, format="delta", mode="overwrite")

In [0]:
# Print a preview of each ingested DataFrame: customers, then orders, then products.
for preview_df in (df_customers, df_orders, df_products):
    preview_df.show()

In [0]:
# 2. Total revenue per product.
# Revenue is Quantity * Price summed over orders whose Status is 'Delivered';
# the result is sorted with the highest-earning product first.
revenue_per_product_sql = """
    SELECT 
        ProductID, 
        SUM(Quantity * Price) AS TotalRevenue
    FROM ecommerce.orders
    WHERE Status = 'Delivered'
    GROUP BY ProductID
    ORDER BY TotalRevenue DESC
"""
spark.sql(revenue_per_product_sql).show()

# 3. Revenue by region.
# Orders are joined to customers on CustomerID so each delivered order can be
# attributed to its customer's Region before summing Quantity * Price.
revenue_by_region_sql = """
    SELECT 
        c.Region, 
        SUM(o.Quantity * o.Price) AS RegionalRevenue
    FROM ecommerce.orders o
    JOIN ecommerce.customers c 
        ON o.CustomerID = c.CustomerID
    WHERE o.Status = 'Delivered'
    GROUP BY c.Region
"""
regional_revenue = spark.sql(revenue_by_region_sql)
regional_revenue.show()

# 4. Mark every 'Pending' order as 'Cancelled'.
# This rewrites the Delta table in place; the result of the statement is shown below.
cancel_pending_sql = """
  UPDATE ecommerce.orders
  SET Status = 'Cancelled'
  WHERE Status = 'Pending'
"""
update_result = spark.sql(cancel_pending_sql)
update_result.show()

# 5. Merge a new return record into the orders table.
from pyspark.sql import Row

# One simulated return, with a field for each column the MERGE copies into orders.
return_row = Row(
    OrderID=3006,
    CustomerID='C003',
    ProductID='P1003',
    Quantity=1,
    Price=30000,
    OrderDate='2024-05-06',
    Status='Returned',
)
new_return = spark.createDataFrame([return_row])

# Expose the DataFrame to SQL under the name the MERGE statement refers to.
new_return.createOrReplaceTempView("new_return")

# Upsert keyed on OrderID: an existing order is overwritten with the source
# row, an unknown OrderID is inserted as a new row.
merge_return_sql = """
MERGE INTO ecommerce.orders AS target
USING new_return AS source
ON target.OrderID = source.OrderID
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
"""
spark.sql(merge_return_sql).show()

In [0]:
# DLT Pipeline
# 6. raw -> cleaned -> aggregated, built here with plain Delta tables and
#    Spark SQL (this cell does not use the dlt API).
#    Cleaned:    rows containing any NULL are dropped.
#    Aggregated: total revenue per product Category.
orders_current = spark.sql("SELECT * FROM ecommerce.orders")
df_cleaned = orders_current.dropna()

(
    df_cleaned.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerce.orders_cleaned")
)

# The aggregate is computed from the cleaned table and displayed; it is not
# written back to a table in this cell.
revenue_per_category_sql = """
    SELECT 
        p.Category, 
        SUM(o.Quantity * o.Price) AS TotalRevenue
    FROM ecommerce.orders_cleaned o
    JOIN ecommerce.products p
        ON o.ProductID = p.ProductID
    WHERE o.Status = 'Delivered'
    GROUP BY p.Category
"""
spark.sql(revenue_per_category_sql).show()

In [0]:
# Time Travel
# 7. View the orders data as it was before the Status update.
# Version 0 is the table's first commit.
# NOTE(review): this is the pre-update state only if the table was first
# created by this notebook's ingest cell — confirm with DESCRIBE HISTORY.
orders_v0_sql = """
    SELECT * FROM ecommerce.orders VERSION AS OF 0
"""
spark.sql(orders_v0_sql).show()

# 8. Restore the orders table to an older version.
# df_old stays defined as a lazy handle on the version-0 snapshot so any code
# that refers to it keeps working; it is no longer used to perform the restore.
df_old = spark.read.format("delta").option("versionAsOf", 0).table("ecommerce.orders")

# Use Delta's RESTORE command instead of reading version 0 and overwriting the
# table with it. RESTORE is the documented way to roll a table back and does
# not push every row through a DataFrame write.
# The statement returns a small metrics DataFrame, which is displayed.
spark.sql("RESTORE TABLE ecommerce.orders TO VERSION AS OF 0").show()

In [0]:
# Vacuum + Retention
# 9. Run VACUUM after changing the default retention.
#
# Delta rejects a VACUUM whose retention window is shorter than the default
# unless its safety check is switched off. The switch lives under the
# `spark.databricks.delta.` prefix; the previously used key
# `spark.delta.retentionDurationCheck.enabled` is not read by Delta, so the
# check stayed on.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

# WARNING: RETAIN 0 HOURS removes every data file that the current table
# version does not reference, so older versions can no longer be queried with
# time travel afterwards.
spark.sql("VACUUM ecommerce.orders RETAIN 0 HOURS")

In [0]:
# Expectations
# 10. Quantity > 0, Price > 0, OrderDate is not null.
# This query only selects the rows that satisfy all three rules; it does not
# modify the table or reject the rows that fail them.
valid_orders_sql = """
    SELECT * FROM ecommerce.orders
    WHERE Quantity > 0 AND Price > 0 AND OrderDate IS NOT NULL
"""
valid_orders = spark.sql(valid_orders_sql)
valid_orders.show()

In [0]:
# Bonus
# 11. Add an OrderType column: "Return" when Status == 'Returned', otherwise "Sale".
# The task is phrased in terms of when/otherwise; this cell expresses the same
# rule with a SQL CASE expression.
order_type_sql = """
    SELECT *, 
        CASE 
            WHEN Status = 'Returned' THEN 'Return'
            ELSE 'Sale'
        END AS OrderType
    FROM ecommerce.orders
"""
spark.sql(order_type_sql).show()