# **Step 1: Generate Silver Layer Data with Faker (Simulating Orders)**
Use Python with PySpark and Faker (or a similar library) to create cleaned silver-layer orders whose order dates span two years, so that year-over-year (YoY) metrics can be computed:

In [0]:
pip install faker

In [0]:
from faker import Faker
import random
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType

# Seed both random sources so that re-running the notebook regenerates the same
# synthetic orders: Faker produces the order ids and dates, while the stdlib
# `random` module picks the year, customer, item and amount.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Initialize Faker and Spark session
faker = Faker()
spark = SparkSession.builder.appName("GenerateSilverData").getOrCreate()

# Create schema in Databricks if not exists
spark.sql("CREATE SCHEMA IF NOT EXISTS lakehouse_demo.silver")

# Generate fake orders for current and previous year
def generate_fake_orders(num_records=1000, years=None):
    """Build a list of synthetic order tuples dated within whole calendar years.

    Args:
        num_records: Number of orders to generate.
        years: Optional iterable of calendar years to draw order dates from.
            When omitted, the previous and the current calendar year (per the
            local clock) are used, so measures that compare
            ``YEAR(CURRENT_DATE())`` against the year before always find data
            no matter when the notebook is run.

    Returns:
        A list of ``(order_id, order_date, customer_id, item_id, amount)``
        tuples. ``order_date`` falls anywhere between 1 January and
        31 December of the chosen year, so dates in the current year may lie
        in the future.

    Raises:
        ValueError: If ``years`` is given but empty.
    """
    if years is None:
        current_year = datetime.now().year
        years = (current_year - 1, current_year)
    years = list(years)  # random.choice needs an indexable sequence
    if not years:
        raise ValueError("years must contain at least one calendar year")

    # The first/last day of each year does not change per order: compute once.
    year_bounds = {
        year: (datetime(year, 1, 1).date(), datetime(year, 12, 31).date())
        for year in years
    }

    orders = []
    for _ in range(num_records):
        order_id = faker.uuid4()
        start_date, end_date = year_bounds[random.choice(years)]
        order_date = faker.date_between(start_date=start_date, end_date=end_date)
        customer_id = f'cust_{random.randint(1, 100)}'
        item_id = f'item_{random.randint(1, 50)}'
        amount = round(random.uniform(10.0, 500.0), 2)
        orders.append((order_id, order_date, customer_id, item_id, amount))
    return orders

# Spark schema for the orders DataFrame; every column is declared non-nullable
schema = (
    StructType()
    .add("order_id", StringType(), nullable=False)
    .add("order_date", DateType(), nullable=False)
    .add("customer_id", StringType(), nullable=False)
    .add("item_id", StringType(), nullable=False)
    .add("amount", DoubleType(), nullable=False)
)

# Build 1000 synthetic orders and load them into a typed Spark DataFrame
data = generate_fake_orders(num_records=1000)
df = spark.createDataFrame(data, schema=schema)

# Persist as the silver Delta table, replacing whatever a previous run wrote
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("lakehouse_demo.silver.orders_cleaned")
)

# Step 2: Create Gold Layer Aggregations
Create the gold-layer customer dimension and order fact tables that the year-over-year (YoY) metrics are built on:

In [0]:
%sql
-- Make sure the gold schema is available before building the tables below
CREATE SCHEMA IF NOT EXISTS lakehouse_demo.gold;

-- Customer dimension: one row per distinct customer found in the silver orders
CREATE OR REPLACE TABLE lakehouse_demo.gold.dim_customer AS
SELECT
  o.customer_id,
  o.customer_id AS customer_key -- the natural id doubles as the key for now; can be enhanced as needed
FROM lakehouse_demo.silver.orders_cleaned AS o
GROUP BY o.customer_id;

-- Order fact: one row per silver order, carrying the customer key
CREATE OR REPLACE TABLE lakehouse_demo.gold.fact_customer_orders AS
SELECT
  o.customer_id AS customer_key,
  o.order_id,
  o.order_date,
  o.amount
FROM lakehouse_demo.silver.orders_cleaned AS o;

# Step 3: Create the Metric View using this YAML
Create a metric view to centralize and standardize customer spending KPIs, including a year-over-year growth calculation.

In [0]:
%sql
-- Create the metric view in code so the notebook runs top to bottom: later
-- cells query and drop lakehouse_demo.gold.customer_spending_metrics, which
-- would not exist if this YAML were only left as a comment.
-- NOTE(review): needs a runtime / SQL warehouse with metric view support.
CREATE OR REPLACE VIEW lakehouse_demo.gold.customer_spending_metrics
WITH METRICS
LANGUAGE YAML
AS $$
version: 0.1
source: lakehouse_demo.gold.fact_customer_orders

dimensions:
  - name: Customer Key
    expr: customer_key
  - name: Order Year
    expr: YEAR(order_date)

measures:
  - name: Total Orders
    expr: COUNT(DISTINCT order_id)

  - name: Total Spent
    expr: SUM(amount)

  # Orders placed in the calendar year of the query date
  - name: Total Orders Current Year
    expr: COUNT(DISTINCT CASE WHEN YEAR(order_date) = YEAR(CURRENT_DATE()) THEN order_id END)

  # Orders placed in the calendar year before the query date
  - name: Total Orders Last Year
    expr: COUNT(DISTINCT CASE WHEN YEAR(order_date) = YEAR(CURRENT_DATE()) - 1 THEN order_id END)

  # Percentage change from last year to this year; NULL when there is no
  # prior-year order to compare against (avoids division by zero)
  - name: YoY Order Growth Pct
    expr: |
      CASE
        WHEN MEASURE(`Total Orders Last Year`) = 0 THEN NULL
        ELSE ((MEASURE(`Total Orders Current Year`) * 1.0 / MEASURE(`Total Orders Last Year`)) - 1) * 100
      END
$$;

# Step 4: Querying the Metric View
Querying metric views ensures consistent, centralized business metrics by abstracting complex calculations into a reusable layer, reducing duplication and errors. Metric views can also be used across dashboards, notebooks, and AI tools, enabling flexible, governed analytics and natural language exploration for business users.

In [0]:
%sql
-- Per-customer KPIs from the metric view, keeping the 20 customers with the
-- highest year-over-year order growth (customers without a growth value sort last)
SELECT
  `Customer Key`,
  MEASURE(`Total Orders`)              AS total_orders,
  MEASURE(`Total Spent`)               AS total_spent,
  MEASURE(`Total Orders Current Year`) AS total_orders_current_year,
  MEASURE(`Total Orders Last Year`)    AS total_orders_last_year,
  MEASURE(`YoY Order Growth Pct`)      AS yoy_growth_pct
FROM lakehouse_demo.gold.customer_spending_metrics
GROUP BY `Customer Key`
ORDER BY yoy_growth_pct DESC NULLS LAST
LIMIT 20;

# Step 5: Visualize the data with Pandas and Python

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# _sqldf contains the DataFrame from the last SQL cell
pandas_df = _sqldf.toPandas()

# The growth measure is computed with a `* 1.0` decimal literal and is NULL for
# customers without prior-year orders, so it can arrive as an object column of
# Decimal/None values. Cast to float and drop the missing rows so the bar
# lengths are plain numbers and every plotted customer has a growth figure.
growth_df = (
    pandas_df
    .assign(yoy_growth_pct=lambda frame: frame["yoy_growth_pct"].astype(float))
    .dropna(subset=["yoy_growth_pct"])
)

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 6))

# Horizontal bar plot for YoY Order Growth %
sns.barplot(data=growth_df, y='Customer Key', x='yoy_growth_pct', palette='viridis', ax=ax)

ax.set_title("Top 20 Customers by YoY Order Growth %")
ax.set_xlabel("Year-over-Year Order Growth (%)")
ax.set_ylabel("Customer Key")
fig.tight_layout()
plt.show()


In [0]:
%sql
-- Clean-up: drop the metric view queried earlier in the notebook.
-- IF EXISTS keeps this cell from failing when the view is already gone.
DROP VIEW IF EXISTS lakehouse_demo.gold.customer_spending_metrics;