### Import Libraries and Create Spark Session

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

# Build the local Spark session used by every later cell.
# All session-level settings live in one mapping so they can be scanned at a glance.
# NOTE(review): the two Delta entries need the Delta Lake classes on the classpath;
# the captured output of this cell shows a ClassNotFoundException for
# io.delta.sql.DeltaSparkSessionExtension, so the extension is not actually loaded.
_spark_settings = {
    "spark.sql.warehouse.dir": os.path.abspath("../lakehouse"),
    "spark.sql.streaming.schemaInference": "true",
    "spark.sql.streaming.forceDeleteTempCheckpointLocation": "true",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

_session_builder = (
    SparkSession.builder
    .appName("Project2 - Streaming ETL Pipeline")
    .master("local[*]")
)
for _setting_name, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_name, _setting_value)

spark = _session_builder.getOrCreate()

# Only warnings and above are logged.
spark.sparkContext.setLogLevel("WARN")

print("âœ“ Spark Session created for streaming ETL")
print(f"  Version: {spark.version}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/18 05:00:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/18 05:00:33 WARN SparkSession: Cannot use io.delta.sql.DeltaSparkSessionExtension to configure session extensions.
java.lang.ClassNotFoundException: io.delta.sql.DeltaSparkSessionExtension
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at java.base/java.lang.Class.forName0(Native Method)
	at java.base/java.lang.Class.forName(Class.java:529)
	at java.base/java.lang.Class.forName(Class.java:508)
	at org.apache.spark.util.SparkClassUtils.classForName(SparkClassUtils.scala:42)

âœ“ Spark Session created for streaming ETL
  Version: 4.1.0


### Load All Dimension Tables

In [3]:
import pymongo
import pandas as pd

# MongoDB connection.
# The connection string embeds the database user's password, so it is read from the
# MONGODB_URI environment variable instead of being written into the notebook.
# NOTE(review): a password was previously hard-coded in this cell; it should be
# rotated in MongoDB Atlas because it is part of the notebook's history.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set. Export the MongoDB Atlas connection string "
        "(mongodb+srv://<user>:<password>@<host>/...) before running this cell."
    )
client = pymongo.MongoClient(mongodb_uri)
db = client["fashion_retail_lakehouse"]

print("="*60)
print("LOADING DIMENSIONS FROM MONGODB ATLAS")
print("="*60)

# Load Customers from MongoDB Atlas (the whole collection is pulled into memory)
print("\nðŸ“Š Loading customers from MongoDB Atlas...")
customers_from_mongo = list(db.customers.find())
df_dim_customers_pandas = pd.DataFrame(customers_from_mongo)

# Remove MongoDB's _id field
if '_id' in df_dim_customers_pandas.columns:
    df_dim_customers_pandas = df_dim_customers_pandas.drop('_id', axis=1)

# Convert to Spark DataFrame
df_dim_customers = spark.createDataFrame(df_dim_customers_pandas)

# Add customer_key: a 1-based surrogate key assigned in customer_reference_id order.
# The window has no partitioning, so Spark evaluates it on a single partition.
window_spec = Window.orderBy("customer_reference_id")
df_dim_customers = df_dim_customers.withColumn("customer_key", row_number().over(window_spec))

# Select the dimension columns and derive loyalty_member ("Yes" for Gold/Platinum tiers).
# The result is cached: the dimension is reused by several later actions, and without
# the cache each of them re-runs the single-partition window above (visible in the
# captured output as repeated WindowExec warnings).
df_dim_customers = df_dim_customers.select(
    col("customer_key"),
    col("customer_reference_id"),
    col("first_name"),
    col("last_name"),
    col("email"),
    col("city"),
    col("age"),
    col("loyalty_tier")
).withColumn(
    "loyalty_member",
    when(col("loyalty_tier").isin("Gold", "Platinum"), "Yes").otherwise("No")
).cache()

# This count is the first action on the cached frame, so it also fills the cache.
print(f"âœ“ Loaded {df_dim_customers.count()} customers from MongoDB Atlas")

# Load Products from MongoDB Atlas (the whole collection is pulled into memory)
print("\nðŸ“¦ Loading products from MongoDB Atlas...")
products_from_mongo = list(db.products.find())
df_dim_products_pandas = pd.DataFrame(products_from_mongo)

# Remove MongoDB's _id field
if '_id' in df_dim_products_pandas.columns:
    df_dim_products_pandas = df_dim_products_pandas.drop('_id', axis=1)

# product_key is a 1-based surrogate key assigned in item_name order.
# The window has no partitioning, so Spark evaluates it on a single partition.
df_dim_products = spark.createDataFrame(df_dim_products_pandas)
window_spec = Window.orderBy("item_name")
df_dim_products = df_dim_products.withColumn("product_key", row_number().over(window_spec))

# Keep only the dimension attributes and cache the result: the dimension is reused by
# several later actions, and without the cache each of them re-runs the window above.
df_dim_products = df_dim_products.select(
    col("product_key"),
    col("item_name"),
    col("category"),
    col("brand"),
    col("material"),
    col("season"),
    col("gender_target"),
    col("base_price"),
    col("stock_quantity")
).cache()

# This count is the first action on the cached frame, so it also fills the cache.
print(f"âœ“ Loaded {df_dim_products.count()} products from MongoDB Atlas")

# Generate Date Dimension (programmatic - same as before)
print("\nðŸ“… Generating date dimension...")
from datetime import datetime, timedelta

# One row per calendar day: 365 consecutive days starting 2023-01-01, with
# date_key running from 1 to 365.
# NOTE(review): a sale dated outside this window cannot match any date_key -
# confirm the window covers the sales data.
start_date = datetime(2023, 1, 1)
dates = []
for i in range(365):
    date = start_date + timedelta(days=i)
    dates.append(dict(
        date_key=i + 1,
        purchase_date=date.strftime('%Y-%m-%d'),
        full_date=date,
        year=date.year,
        month=date.month,
        day=date.day,
        quarter=(date.month + 2) // 3,  # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
        month_name=date.strftime('%B'),
        day_name=date.strftime('%A'),
    ))
df_dim_date = spark.createDataFrame(dates)

print(f"âœ“ Generated {df_dim_date.count()} date records")

# Create Payment Dimension (in-memory - same as before)
print("\nðŸ’³ Creating payment dimension...")

# payment_key is the 1-based position of each method in this fixed list.
payment_data = [
    {'payment_key': method_key, 'payment_method': method_name}
    for method_key, method_name in enumerate(
        ['Credit Card', 'PayPal', 'Cash', 'Debit Card'], start=1
    )
]
df_dim_payment = spark.createDataFrame(payment_data)

print(f"âœ“ Created {df_dim_payment.count()} payment methods")

# Recap: one line per dimension table with its current row count.
print("\n" + "="*60)
print("DIMENSION TABLES LOADED")
print("="*60)
for _dim_label, _dim_frame in [
    ("Customers (MongoDB)", df_dim_customers),
    ("Products (MongoDB)", df_dim_products),
    ("Dates (Generated)", df_dim_date),
    ("Payment Methods", df_dim_payment),
]:
    print(f"âœ“ {_dim_label}: {_dim_frame.count()} records")
print("="*60)

LOADING DIMENSIONS FROM MONGODB ATLAS

ðŸ“Š Loading customers from MongoDB Atlas...


                                                                                

âœ“ Loaded 166 customers from MongoDB Atlas

ðŸ“¦ Loading products from MongoDB Atlas...
âœ“ Loaded 61 products from MongoDB Atlas

ðŸ“… Generating date dimension...
âœ“ Generated 365 date records

ðŸ’³ Creating payment dimension...
âœ“ Created 4 payment methods

DIMENSION TABLES LOADED
âœ“ Customers (MongoDB): 166 records
âœ“ Products (MongoDB): 61 records
âœ“ Dates (Generated): 365 records
âœ“ Payment Methods: 4 records


### Bronze Layer - Read Streaming Sales Data

In [4]:
# Define output paths
bronze_output = "../lakehouse/sales/bronze"
bronze_checkpoint = os.path.join(bronze_output, "_checkpoint")

# Define (but do not start) a streaming read of the sales JSON files.
# The files hold one JSON record per line (the later batch read and the pandas check
# both read them as line-delimited), so the reader's default line-delimited mode is
# used here; the previous multiLine=true setting contradicted that format.
# No schema is passed; the session is created with
# spark.sql.streaming.schemaInference=true.
df_sales_bronze = (
    spark.readStream
    .option("maxFilesPerTrigger", 1)  # at most one new file per micro-batch
    .json("../streaming/sales/")
    .withColumn("receipt_time", current_timestamp())   # ingestion timestamp
    .withColumn("source_file", input_file_name())      # originating file path
)

print("âœ“ Bronze streaming query defined")
print("  Source: ../streaming/sales/")
print(f"  Output: {bronze_output}")
print(f"  Is Streaming: {df_sales_bronze.isStreaming}")

âœ“ Bronze streaming query defined
  Source: ../streaming/sales/
  Output: ../lakehouse/sales/bronze
  Is Streaming: True


In [5]:
import shutil

# Stop all active queries.
# Stopping stays best-effort (one failing query must not block the reset), but the
# failure is now reported instead of being silently swallowed, and only Exception is
# caught so that KeyboardInterrupt / SystemExit still propagate.
for query in spark.streams.active:
    try:
        query.stop()
        print(f"âœ“ Stopped query: {query.name}")
    except Exception as exc:
        print(f"WARNING: could not stop streaming query: {exc!r}")

# Define paths
silver_output = "../lakehouse/sales/silver"

# DELETE bronze and silver directories completely (this is destructive: all
# previously written layer output is removed so the files are reprocessed from scratch)
for path in [bronze_output, silver_output]:
    if os.path.exists(path):
        shutil.rmtree(path)
        print(f"âœ“ Deleted: {path}")

print("\nâœ… Ready to reprocess ALL 10 JSON files")

âœ“ Deleted: ../lakehouse/sales/bronze
âœ“ Deleted: ../lakehouse/sales/silver

âœ… Ready to reprocess ALL 10 JSON files


### Write Bronze Layer to Parquet

In [6]:
# Remove any checkpoint directory left behind under the bronze output path.
import shutil
if os.path.exists(bronze_checkpoint):
    shutil.rmtree(bronze_checkpoint)
    print("âœ“ Cleaned old checkpoint")

# Explicit schema for the raw sales records; every field is nullable.
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Customer Reference ID", LongType()),
        ("Date Purchase", StringType()),
        ("Item Purchased", StringType()),
        ("Payment Method", StringType()),
        ("Purchase Amount (USD)", DoubleType()),
        ("Review Rating", DoubleType()),
    ]
])

# Batch read of the line-delimited JSON files (no multiLine option), tagging each
# row with the ingestion timestamp and the file it came from.
df_sales_bronze = (
    spark.read
    .schema(schema)
    .json("../streaming/sales/")
    .withColumn("receipt_time", current_timestamp())
    .withColumn("source_file", input_file_name())
)

print(f"âœ“ Read {df_sales_bronze.count()} records from JSON files")

# Persist the bronze layer as parquet, replacing whatever is already at the path.
df_sales_bronze.write.parquet(bronze_output, mode="overwrite")

print("âœ… Bronze layer complete!")

# Read the parquet output back to confirm the number of rows written.
df_verify = spark.read.parquet(bronze_output)
print(f"âœ“ Total records in bronze: {df_verify.count()}")

âœ“ Read 3400 records from JSON files


25/12/18 05:00:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/12/18 05:00:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/12/18 05:00:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers


âœ… Bronze layer complete!
âœ“ Total records in bronze: 3400


25/12/18 05:00:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/12/18 05:00:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


In [7]:
import pandas as pd

# Cross-check with pandas: count the records in sales_001.json .. sales_010.json,
# reading each file as line-delimited JSON, and add them up.
total = 0
for i in range(1, 11):
    file_name = f"sales_{i:03d}.json"
    df = pd.read_json(f"../streaming/sales/{file_name}", lines=True)
    print(f"{file_name}: {len(df)} records")
    total += len(df)

print(f"\nTotal records in all files: {total}")

sales_001.json: 340 records
sales_002.json: 340 records
sales_003.json: 340 records
sales_004.json: 340 records
sales_005.json: 340 records
sales_006.json: 340 records
sales_007.json: 340 records
sales_008.json: 340 records
sales_009.json: 340 records
sales_010.json: 340 records

Total records in all files: 3400


### Silver Layer - Join with Dimensions

In [8]:
# Define silver paths
silver_output = "../lakehouse/sales/silver"

# Build the silver fact table from the bronze parquet data.
# Every dimension is attached with a left outer join, so a sale with no matching
# dimension row is kept and simply gets a NULL surrogate key. Only the four keys,
# the two measures and the ingestion metadata are retained.
df_sales_silver = (
    spark.read.parquet(bronze_output)
    .join(
        df_dim_customers,
        on=col("Customer Reference ID") == df_dim_customers["customer_reference_id"],
        how="left",
    )
    .join(
        df_dim_products,
        on=col("Item Purchased") == df_dim_products["item_name"],
        how="left",
    )
    .join(
        df_dim_payment,
        on=col("Payment Method") == df_dim_payment["payment_method"],
        how="left",
    )
    .join(
        df_dim_date,
        on=to_date(col("Date Purchase")) == df_dim_date["full_date"],
        how="left",
    )
    .select(
        "customer_key",
        "product_key",
        "payment_key",
        "date_key",
        col("Purchase Amount (USD)").alias("purchase_amount"),
        col("Review Rating").alias("review_rating"),
        "receipt_time",
        "source_file",
    )
)

print("âœ“ Silver transformations defined")
print(f"  Records before write: {df_sales_silver.count()}")

# Persist the silver layer as parquet, replacing whatever is already at the path.
df_sales_silver.write.parquet(silver_output, mode="overwrite")

print("\nâœ… Silver layer complete!")

# Read the parquet output back to confirm the number of rows written.
df_silver_verify = spark.read.parquet(silver_output)
print(f"âœ“ Total records in silver: {df_silver_verify.count()}")

# For each foreign key, count the rows where the key is NULL (i.e. the left join
# found no matching dimension row).
fk_columns = ['customer_key', 'product_key', 'payment_key', 'date_key']
null_counts = df_silver_verify.select(
    [count(when(col(fk).isNull(), fk)).alias(fk) for fk in fk_columns]
)
print("\nNULL foreign key counts:")
null_counts.show()

âœ“ Silver transformations defined
  Records before write: 3400


25/12/18 05:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 0


âœ… Silver layer complete!
âœ“ Total records in silver: 3400

NULL foreign key counts:
+------------+-----------+-----------+--------+
|customer_key|product_key|payment_key|date_key|
+------------+-----------+-----------+--------+
|           0|          0|          0|     853|
+------------+-----------+-----------+--------+



### Verify Silver Layer Data

In [9]:
# Reload the persisted silver layer and inspect its size, columns and a sample.
df_silver_check = spark.read.parquet(silver_output)

silver_row_count = df_silver_check.count()
print(f"âœ“ Silver layer records: {silver_row_count}")
print(f"âœ“ Columns: {df_silver_check.columns}")
print("\nSample silver records:")
df_silver_check.show(n=10)

# Per foreign key, the number of rows whose key is NULL.
foreign_keys = ['customer_key', 'product_key', 'payment_key', 'date_key']
null_key_exprs = [
    count(when(col(fk).isNull(), fk)).alias(fk)
    for fk in foreign_keys
]
null_counts = df_silver_check.select(null_key_exprs)
print("\nNULL foreign key counts:")
null_counts.show()

âœ“ Silver layer records: 3400
âœ“ Columns: ['customer_key', 'product_key', 'payment_key', 'date_key', 'purchase_amount', 'review_rating', 'receipt_time', 'source_file']

Sample silver records:
+------------+-----------+-----------+--------+---------------+-------------+--------------------+--------------------+
|customer_key|product_key|payment_key|date_key|purchase_amount|review_rating|        receipt_time|         source_file|
+------------+-----------+-----------+--------+---------------+-------------+--------------------+--------------------+
|         116|         33|          1|    NULL|          170.0|          3.6|2025-12-18 05:00:...|file:///Users/sim...|
|         116|         28|          3|    NULL|           NULL|          1.2|2025-12-18 05:00:...|file:///Users/sim...|
|         161|         23|          3|     106|          162.0|          1.1|2025-12-18 05:00:...|file:///Users/sim...|
|         152|         41|          1|      78|         2356.0|          4.8|2025-12-1

### Gold Layer 1: Category Performance by Quarter

In [10]:
# Gold aggregation 1: revenue and rating metrics per product category, quarter and year.
# Both joins are inner joins on the surrogate keys, so silver rows whose product_key
# or date_key is NULL do not contribute to these aggregates.
category_metrics = [
    count("customer_key").alias("num_transactions"),
    sum("purchase_amount").alias("total_revenue"),
    avg("purchase_amount").alias("avg_transaction"),
    avg("review_rating").alias("avg_rating"),
]

df_sales_gold_category = (
    spark.read.parquet(silver_output)
    .join(df_dim_products, on="product_key")
    .join(df_dim_date, on="date_key")
    .groupBy("category", "quarter", "year")
    .agg(*category_metrics)
    .orderBy(col("total_revenue").desc())  # highest revenue first
)

print(f"âœ“ Gold layer (Category) records: {df_sales_gold_category.count()}")
print("\nTop 10 Category Performance by Revenue:")
df_sales_gold_category.show(n=10, truncate=False)

print("\nâœ… Gold layer (Category) complete!")

25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 0

âœ“ Gold layer (Category) records: 31

Top 10 Category Performance by Revenue:
+-----------+-------+----+----------------+-------------+------------------+------------------+
|category   |quarter|year|num_transactions|total_revenue|avg_transaction   |avg_rating        |
+-----------+-------+----+----------------+-------------+------------------+------------------+
|Accessories|2      |2023|213             |28520.0      |161.12994350282486|3.081218274111675 |
|Accessories|1      |2023|222             |26611.0      |147.02209944751382|3.0898989898989897|
|Tops       |1      |2023|200             |25895.0      |156.93939393939394|2.773157894736845 |
|Tops       |2      |2023|191             |23882.0      |155.07792207792207|3.0110465116279075|
|Accessories|3      |2023|216             |23048.0      |130.95454545454547|3.0393782383419676|
|Bottoms    |2      |2023|141             |20429.0      |185.71818181818182|3.079069767441861 |
|Tops       |3      |2023|194             |18531.0      |

25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 0

### Gold Layer 2: Customer Loyalty Analysis

In [11]:
# Gold aggregation 2: purchase metrics per loyalty flag and product category.
# Both joins are inner joins on the surrogate keys. unique_customers is an
# approximate distinct count of customer_key within each group.
loyalty_metrics = [
    count("customer_key").alias("num_purchases"),
    sum("purchase_amount").alias("total_spent"),
    avg("purchase_amount").alias("avg_purchase"),
    approx_count_distinct("customer_key").alias("unique_customers"),
]

df_sales_gold_loyalty = (
    spark.read.parquet(silver_output)
    .join(df_dim_customers, on="customer_key")
    .join(df_dim_products, on="product_key")
    .groupBy("loyalty_member", "category")
    .agg(*loyalty_metrics)
    .orderBy(col("total_spent").desc())  # biggest spenders first
)

print(f"âœ“ Gold layer (Loyalty) records: {df_sales_gold_loyalty.count()}")
print("\nTop 10 Customer Loyalty Performance:")
df_sales_gold_loyalty.show(n=10, truncate=False)

print("\nâœ… Gold layer (Loyalty) complete!")

25/12/18 05:00:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 0

âœ“ Gold layer (Loyalty) records: 18

Top 10 Customer Loyalty Performance:
+--------------+-----------+-------------+-----------+------------------+----------------+
|loyalty_member|category   |num_purchases|total_spent|avg_purchase      |unique_customers|
+--------------+-----------+-------------+-----------+------------------+----------------+
|Yes           |Accessories|471          |63996.0    |168.85488126649076|91              |
|Yes           |Tops       |424          |56737.0    |164.45507246376812|93              |
|No            |Tops       |352          |40376.0    |141.6701754385965 |76              |
|No            |Accessories|397          |38852.0    |115.97611940298508|78              |
|Yes           |Bottoms    |264          |33513.0    |151.64253393665157|89              |
|Yes           |Outerwear  |238          |31071.0    |161.828125        |91              |
|No            |Footwear   |166          |29247.0    |219.90225563909775|67              |
|No            

25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 0

### Pipeline Summary

In [12]:
# Final report for the pipeline.
# Every figure printed below is computed from the bronze/silver/gold data rather than
# typed in by hand, so the summary cannot drift from the data it describes.
bronze_count = spark.read.parquet(bronze_output).count()
silver_summary_df = spark.read.parquet(silver_output)
silver_count = silver_summary_df.count()
category_count = df_sales_gold_category.count()
loyalty_count = df_sales_gold_loyalty.count()

# df_sales_gold_category is ordered by total_revenue descending, so its first row is
# the best-performing category/quarter. first() returns None for an empty frame.
top_category = df_sales_gold_category.first()

# Average purchase amount per loyalty flag ("Yes"/"No"), over all silver rows that
# have a matching customer.
loyalty_avg = {
    row["loyalty_member"]: row["avg_purchase"]
    for row in (
        silver_summary_df
        .join(df_dim_customers, "customer_key")
        .groupBy("loyalty_member")
        .agg(avg("purchase_amount").alias("avg_purchase"))
        .collect()
    )
}

# Silver rows whose sale could not be matched to a date_key.
null_date_count = silver_summary_df.filter(col("date_key").isNull()).count()
null_date_pct = 100 * null_date_count / silver_count if silver_count else 0.0

print("="*80)
print("STREAMING ETL PIPELINE COMPLETE")
print("="*80)
print(f"âœ“ Bronze Layer: {bronze_count} records")
print(f"âœ“ Silver Layer: {silver_count} records")
print(f"âœ“ Gold Layer (Category): {category_count} aggregates")
print(f"âœ“ Gold Layer (Loyalty): {loyalty_count} aggregates")
print("="*80)
print("\nâœ… Medallion Architecture Successfully Implemented!")
print(f"   - Bronze: Raw batch data with metadata ({bronze_count:,} records)")
print(f"   - Silver: Cleaned and joined with dimensions ({silver_count:,} records)")
print(f"   - Gold: Business-level aggregations ({category_count} + {loyalty_count} aggregates)")
print("\nðŸ“Š Key Insights:")
if top_category is not None and top_category["total_revenue"] is not None:
    print(
        f"   - Top Category: {top_category['category']} "
        f"(Q{top_category['quarter']} {top_category['year']}) - "
        f"${top_category['total_revenue']:.2f} revenue"
    )
member_avg = loyalty_avg.get("Yes")
non_member_avg = loyalty_avg.get("No")
if member_avg is not None and non_member_avg is not None:
    print(
        f"   - Avg purchase: loyalty members ~${member_avg:.0f} "
        f"vs non-members ~${non_member_avg:.0f}"
    )
print(f"   - {null_date_count} records with NULL dates ({null_date_pct:.0f}% - date mismatches)")

STREAMING ETL PIPELINE COMPLETE
âœ“ Bronze Layer: 3400 records
âœ“ Silver Layer: 3400 records


25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 0

âœ“ Gold Layer (Category): 31 aggregates
âœ“ Gold Layer (Loyalty): 18 aggregates

âœ… Medallion Architecture Successfully Implemented!
   - Bronze: Raw batch data with metadata (3,400 records)
   - Silver: Cleaned and joined with dimensions (3,400 records)
   - Gold: Business-level aggregations (31 + 18 aggregates)

ðŸ“Š Key Insights:
   - Top Category: Accessories (Q2 2023) - $28520.00 revenue
   - Loyalty Members spend more: ~$165 avg vs ~$140 avg
   - 853 records with NULL dates (25% - date mismatches)


25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/18 05:00:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
