In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Analyze the three silver tables
# Profiles the rows flagged is_current in each silver table and keeps the filtered
# DataFrames (df_features, df_sales, df_stores) for the cells that follow.
# NOTE: min / max / sum below are the Spark column functions brought in by the
# star import from pyspark.sql.functions, not the Python builtins.
print("=== ANALYZING SILVER TABLES ===\n")

# 1. Features table
print("1. FEATURES TABLE")
print("-" * 60)
df_features = spark.table("practise.silver.features").filter(col("is_current") == True)
print(f"Total current records: {df_features.count():,}")
print(f"Date range: {df_features.agg(min('Date'), max('Date')).collect()[0]}")
print(f"Stores: {df_features.select('Store').distinct().count()}")
print("\nSample data:")
display(df_features.limit(3))

# 2. Sales table
print("\n2. SALES TABLE")
print("-" * 60)
df_sales = spark.table("practise.silver.sales").filter(col("is_current") == True)
print(f"Total current records: {df_sales.count():,}")
print(f"Date range: {df_sales.agg(min('Date'), max('Date')).collect()[0]}")
print(f"Stores: {df_sales.select('Store').distinct().count()}")
print(f"Departments: {df_sales.select('Dept').distinct().count()}")

# sum() over zero rows (or only NULL values) yields NULL, which arrives here as
# None and cannot be formatted with ':,.2f'. Report 0 instead of raising.
total_sales = df_sales.agg(sum('Weekly_Sales')).collect()[0][0]
if total_sales is None:
    total_sales = 0
print(f"Total Sales: ${total_sales:,.2f}")
print("\nSample data:")
display(df_sales.limit(3))

# 3. Stores table
print("\n3. STORES TABLE")
print("-" * 60)
df_stores = spark.table("practise.silver.stores").filter(col("is_current") == True)
print(f"Total current records: {df_stores.count():,}")
# collect() returns a list of Row objects, so this prints their repr.
print(f"Store Types: {df_stores.select('Type').distinct().collect()}")
print("\nSample data:")
display(df_stores.limit(3))

In [0]:
# Create Gold Table 1: Sales Summary by Store
# Aggregates the current sales rows per store and enriches them with the store's
# Type and Size. Output columns: Store, Type, Size, Total_Sales, Avg_Weekly_Sales,
# Total_Weeks, Num_Departments, Holiday_Sales, Non_Holiday_Sales, Sales_Per_SqFt,
# Holiday_Sales_Pct.

print("Creating Gold Table 1: Sales Summary by Store...\n")

sales_by_store_table = "practise.gold.sales_by_store"

# Only Type and Size are needed from the stores table. Trimming it before the join
# keeps its other columns (e.g. is_current, which df_sales also carries) from
# appearing twice in the joined result.
store_attributes = df_stores.select("Store", "Type", "Size")

gold_sales_by_store = df_sales \
    .join(store_attributes, "Store") \
    .groupBy("Store", "Type", "Size") \
    .agg(
        sum("Weekly_Sales").alias("Total_Sales"),
        # Average over individual sales rows, not over per-week store totals.
        avg("Weekly_Sales").alias("Avg_Weekly_Sales"),
        # A store has rows for several departments (see Num_Departments), so
        # count("*") would report rows, not weeks. Count distinct dates instead.
        # NOTE(review): assumes one Date value per sales week -- confirm.
        countDistinct("Date").alias("Total_Weeks"),
        countDistinct("Dept").alias("Num_Departments"),
        sum(when(col("IsHoliday") == True, col("Weekly_Sales")).otherwise(0)).alias("Holiday_Sales"),
        sum(when(col("IsHoliday") == False, col("Weekly_Sales")).otherwise(0)).alias("Non_Holiday_Sales")
    ) \
    .withColumn(
        "Sales_Per_SqFt",
        # NULL when Size is 0 (or NULL) instead of a divide-by-zero error under ANSI mode.
        when(col("Size") != 0, col("Total_Sales") / col("Size"))
    ) \
    .withColumn(
        "Holiday_Sales_Pct",
        # NULL when the store has no net sales, for the same reason.
        when(col("Total_Sales") != 0, (col("Holiday_Sales") / col("Total_Sales")) * 100)
    ) \
    .orderBy(col("Total_Sales").desc())

# Save to gold layer (create-only: an existing table is deliberately left untouched)
if not spark.catalog.tableExists(sales_by_store_table):
    gold_sales_by_store.write.format("delta").saveAsTable(sales_by_store_table)
    print(f"✅ Created: {sales_by_store_table}")
else:
    print(f"⚠️  Table already exists: {sales_by_store_table}")
    print(f"   Run this to recreate: spark.sql('DROP TABLE {sales_by_store_table}')")

# Report from the persisted table: it shows what is really stored in gold (which
# may differ from the DataFrame above when the table already existed) and avoids
# re-running the join and aggregation for count() and display().
persisted_sales_by_store = spark.table(sales_by_store_table)
print(f"Records: {persisted_sales_by_store.count():,}\n")
print("Top 5 stores by sales:")
# Row order is not guaranteed when reading a table back, so sort explicitly.
display(persisted_sales_by_store.orderBy(col("Total_Sales").desc()).limit(5))

In [0]:
# Create Gold Table 2: Sales Trends Over Time
# Time-based aggregations for trend analysis: one row per (Date, IsHoliday) with
# the calendar parts derived from Date.

print("Creating Gold Table 2: Sales Trends Over Time...\n")

sales_trends_table = "practise.gold.sales_trends"

# Year / Quarter / Month / Week are all derived from Date, so adding them to the
# grouping keys does not split the groups any further than Date already does.
# NOTE(review): weekofyear() is documented as the ISO 8601 week number while year()
# is the calendar year, so dates near a year boundary can pair e.g. Week 52/53
# with the following calendar Year -- confirm consumers expect that.
gold_sales_trends = df_sales \
    .withColumn("Year", year("Date")) \
    .withColumn("Month", month("Date")) \
    .withColumn("Week", weekofyear("Date")) \
    .withColumn("Quarter", quarter("Date")) \
    .groupBy("Year", "Quarter", "Month", "Week", "Date", "IsHoliday") \
    .agg(
        sum("Weekly_Sales").alias("Total_Sales"),
        avg("Weekly_Sales").alias("Avg_Sales"),
        # Number of sales rows in the group (not individual customer transactions).
        count("*").alias("Num_Transactions"),
        countDistinct("Store").alias("Num_Stores"),
        countDistinct("Dept").alias("Num_Departments")
    ) \
    .orderBy("Date")

# Save to gold layer (create-only: an existing table is deliberately left untouched)
if not spark.catalog.tableExists(sales_trends_table):
    gold_sales_trends.write.format("delta").saveAsTable(sales_trends_table)
    print(f"✅ Created: {sales_trends_table}")
else:
    print(f"⚠️  Table already exists: {sales_trends_table}")
    # Same recreate hint as the sales_by_store cell prints.
    print(f"   Run this to recreate: spark.sql('DROP TABLE {sales_trends_table}')")

# Report from the persisted table: it shows what is really stored in gold (which
# may differ from the DataFrame above when the table already existed) and avoids
# re-running the aggregation for count() and display().
persisted_sales_trends = spark.table(sales_trends_table)
print(f"Records: {persisted_sales_trends.count():,}\n")
print("Sample trends:")
# Row order is not guaranteed when reading a table back, so sort explicitly.
display(persisted_sales_trends.orderBy("Date").limit(5))