# Silver to Gold Data Transformation Pipeline

This notebook orchestrates the entire Silver to Gold data transformation process. It executes a series of transformations to create analytical tables in the gold layer.

## Process Overview

1. Customer Metrics Generation
   - Creates detailed customer analytics
   - Includes purchase behavior, segmentation, and lifetime value metrics

2. Product Performance Analysis
   - Analyzes product sales and performance
   - Includes inventory metrics and customer behavior patterns

3. Sales Summary Creation
   - Comprehensive sales analytics
   - Daily aggregations with various dimensions

4. Inventory Insights
   - Stock level analysis
   - Demand patterns and optimization metrics

## Execution Notes

- The notebook uses adaptive query optimization
- Tables are created using the Iceberg table format
- Each section includes progress logging
- A final optimization and statistics-computation pass is performed on all tables


In [0]:
-- Resolve unqualified table names in the cells below against apjtechup.gold.
USE CATALOG apjtechup;
USE SCHEMA gold;

In [0]:
-- Progress marker: one row with a status message and the time the run began.
SELECT
    'Starting Silver to Gold ELT Process' AS status,
    CURRENT_TIMESTAMP() AS start_time;


## 1. Customer Metrics Generation

This step runs the customer metrics transformation notebook to create the customer analytics table.


In [0]:
-- Progress marker for step 1 (customer metrics); terminated with ';' like the
-- other progress statements in this notebook.
SELECT
    'Creating Customer Metrics...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

In [0]:
%run ./01_silver_to_gold_customer_metrics

## 2. Product Performance Analysis

This step runs the product performance transformation notebook to analyze product metrics.


In [0]:
-- Progress marker for step 2 (product performance).
SELECT
    'Creating Product Performance...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;


In [0]:
%run ./02_silver_to_gold_product_performance


## 3. Sales Summary Creation

Creating a comprehensive sales summary table with daily aggregations and various business metrics.


In [0]:
-- Progress marker for step 3 (sales summary).
SELECT
    'Creating Sales Summary...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

-- Builds sales_summary: one row per order date x customer state x product
-- category x brand x customer tier x customer segment ("slice").
--
-- Grain notes:
--   * Order-header amounts (total, net, discount, shipping, tax) are apportioned
--     to each order line by that line's share of the order's item value before
--     aggregating. Summing the header columns directly after the join to order
--     items would count each header amount once per line of the order.
--   * new_customers / returning_customers are counted inside each slice, so
--     returning_customers can never be negative.
--   * revenue_growth_* compare the whole day's revenue with the whole day's
--     revenue 7 / 30 / 365 calendar days earlier (not N rows earlier); the same
--     day-level value is repeated on every slice row of that day. 0 is emitted
--     when there is no positive comparison value.
CREATE OR REPLACE TABLE sales_summary USING ICEBERG AS
WITH first_orders AS (
    -- Date of each customer's earliest order.
    SELECT
        customer_id,
        MIN(order_date_only) AS first_order_date
    FROM apjtechup.silver.orders_clean
    GROUP BY customer_id
),
order_lines AS (
    -- One row per order line, carrying the header amounts plus the fraction of
    -- the header that this line is responsible for (fractions sum to 1 per order).
    SELECT
        o.order_id,
        o.customer_id,
        o.order_date_only AS date_key,
        c.state,
        c.customer_tier,
        COALESCE(cm.customer_segment, 'Unknown') AS customer_segment,
        oi.category_name,
        oi.brand,
        oi.quantity,
        oi.total_amount AS line_amount,
        oi.unit_cost * oi.quantity AS line_cost,
        o.total_amount AS order_total_amount,
        o.net_amount AS order_net_amount,
        o.discount_amount AS order_discount_amount,
        o.shipping_cost AS order_shipping_cost,
        o.tax_amount AS order_tax_amount,
        CASE
            WHEN fo.first_order_date = o.order_date_only THEN TRUE
            ELSE FALSE
        END AS is_first_order_day,
        CASE
            WHEN SUM(COALESCE(oi.total_amount, 0)) OVER (PARTITION BY o.order_id) > 0
            THEN COALESCE(oi.total_amount, 0)
                 / SUM(COALESCE(oi.total_amount, 0)) OVER (PARTITION BY o.order_id)
            -- No usable item value on the order: split the header evenly.
            ELSE 1.0 / COUNT(*) OVER (PARTITION BY o.order_id)
        END AS line_share
    FROM apjtechup.silver.orders_clean AS o
    INNER JOIN apjtechup.silver.order_items_clean AS oi
        ON o.order_id = oi.order_id
    INNER JOIN apjtechup.silver.customers_clean AS c
        ON o.customer_id = c.customer_id
    LEFT JOIN first_orders AS fo
        ON o.customer_id = fo.customer_id
    LEFT JOIN apjtechup.gold.customer_metrics AS cm
        ON o.customer_id = cm.customer_id
),
daily_sales AS (
    -- Aggregate the order lines to the slice grain.
    SELECT
        date_key,
        state,
        category_name,
        brand,
        customer_tier,
        customer_segment,
        COUNT(DISTINCT order_id) AS total_orders,
        COUNT(DISTINCT customer_id) AS total_customers,
        COUNT(DISTINCT CASE WHEN is_first_order_day THEN customer_id END) AS new_customers,
        SUM(order_total_amount * line_share) AS gross_revenue,
        SUM(order_net_amount * line_share) AS net_revenue,
        SUM(order_discount_amount * line_share) AS total_discounts,
        SUM(order_shipping_cost * line_share) AS shipping_revenue,
        SUM(order_tax_amount * line_share) AS tax_revenue,
        SUM(quantity) AS total_units_sold,
        SUM(line_amount - line_cost) AS total_profit,
        SUM(line_cost) AS total_cost
    FROM order_lines
    GROUP BY
        date_key,
        state,
        category_name,
        brand,
        customer_tier,
        customer_segment
),
daily_totals AS (
    -- Whole-day revenue, used on both sides of the growth comparisons.
    SELECT
        date_key,
        SUM(gross_revenue) AS gross_revenue
    FROM daily_sales
    GROUP BY date_key
)
SELECT
    ds.date_key,
    YEAR(ds.date_key) AS year,
    QUARTER(ds.date_key) AS quarter,
    MONTH(ds.date_key) AS month,
    WEEKOFYEAR(ds.date_key) AS week_of_year,
    CASE DAYOFWEEK(ds.date_key)  -- DAYOFWEEK: 1 = Sunday ... 7 = Saturday
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END AS day_of_week,
    DAY(ds.date_key) AS day_of_month,
    CASE WHEN DAYOFWEEK(ds.date_key) IN (1, 7) THEN TRUE ELSE FALSE END AS is_weekend,
    FALSE AS is_holiday, -- Placeholder

    -- Geographic dimensions (simplified)
    CASE
        WHEN ds.state IN ('CA', 'WA', 'OR', 'NV', 'AZ') THEN 'West'
        WHEN ds.state IN ('TX', 'OK', 'AR', 'LA', 'NM') THEN 'South'
        WHEN ds.state IN ('NY', 'NJ', 'CT', 'MA', 'PA') THEN 'Northeast'
        ELSE 'Other'
    END AS region,
    ds.state,

    -- Product dimensions
    ds.category_name,
    ds.brand,

    -- Customer dimensions
    ds.customer_tier,
    ds.customer_segment,

    -- Sales Metrics
    ds.total_orders,
    ds.total_customers,
    ds.new_customers,
    ds.total_customers - ds.new_customers AS returning_customers,

    -- Revenue Metrics
    ROUND(ds.gross_revenue, 2) AS gross_revenue,
    ROUND(ds.net_revenue, 2) AS net_revenue,
    ROUND(ds.total_discounts, 2) AS total_discounts,
    ROUND(ds.shipping_revenue, 2) AS shipping_revenue,
    ROUND(ds.tax_revenue, 2) AS tax_revenue,

    -- Unit Metrics (total_orders is >= 1 for every slice; GREATEST is a safety net)
    ds.total_units_sold,
    ROUND(ds.gross_revenue / GREATEST(ds.total_orders, 1), 2) AS average_order_value,
    ROUND(ds.total_units_sold / GREATEST(ds.total_orders, 1), 2) AS average_units_per_order,

    -- Profitability
    ROUND(ds.total_cost, 2) AS total_cost,
    ROUND(ds.total_profit, 2) AS gross_profit,
    CASE
        WHEN ds.gross_revenue > 0
        THEN ROUND(ds.total_profit / ds.gross_revenue * 100, 2)
        ELSE 0
    END AS gross_margin_percentage,

    -- Growth Metrics (day total vs. day total N calendar days earlier)
    CASE
        WHEN wk.gross_revenue > 0
        THEN ROUND((dt.gross_revenue - wk.gross_revenue) / wk.gross_revenue * 100, 2)
        ELSE 0
    END AS revenue_growth_wow,
    CASE
        WHEN mo.gross_revenue > 0
        THEN ROUND((dt.gross_revenue - mo.gross_revenue) / mo.gross_revenue * 100, 2)
        ELSE 0
    END AS revenue_growth_mom,
    CASE
        WHEN yr.gross_revenue > 0
        THEN ROUND((dt.gross_revenue - yr.gross_revenue) / yr.gross_revenue * 100, 2)
        ELSE 0
    END AS revenue_growth_yoy,
    0.0 AS customer_growth_rate, -- Placeholder

    -- Operational Metrics (placeholders)
    95.0 AS order_fulfillment_rate,
    2.5 AS average_shipping_days,
    5.0 AS return_rate,

    -- Digital Metrics (placeholders)
    3.5 AS web_conversion_rate,
    45.0 AS mobile_percentage,

    -- Metadata
    CURRENT_TIMESTAMP() AS created_at,
    CURRENT_TIMESTAMP() AS updated_at
FROM daily_sales AS ds
LEFT JOIN daily_totals AS dt
    ON dt.date_key = ds.date_key
LEFT JOIN daily_totals AS wk
    ON wk.date_key = DATE_SUB(ds.date_key, 7)
LEFT JOIN daily_totals AS mo
    ON mo.date_key = DATE_SUB(ds.date_key, 30)
LEFT JOIN daily_totals AS yr
    ON yr.date_key = DATE_SUB(ds.date_key, 365);


## 4. Inventory Insights

This step creates the inventory insights table with stock analysis and demand patterns.


In [0]:
-- Progress marker for step 4 (inventory insights).
SELECT
    'Creating Inventory Insights...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

-- Builds inventory_insights: one row per inventory_clean row (product x
-- warehouse) joined to its product, enriched with demand, risk, financial and
-- recommendation metrics.
--
-- Demand notes:
--   * Order lines are first rolled up to units sold per product per day.
--   * avg_daily_demand is the trailing-90-day unit total divided by 90, so it is
--     a true per-day rate (days with no sales count as zero). An average over
--     order lines would measure units per line, not units per day.
--   * demand_stddev is the sample standard deviation of the per-day totals over
--     the same window, computed over days that had at least one sale.
--   * Products with no demand history get 0 (not NULL) in the demand,
--     recommendation and forecast columns.
--   * The RANDOM()-based columns are placeholders and change on every rebuild.
CREATE OR REPLACE TABLE inventory_insights USING ICEBERG AS
WITH daily_demand AS (
    SELECT
        oi.product_id,
        o.order_date_only AS demand_date,
        SUM(oi.quantity) AS units_sold
    FROM apjtechup.silver.order_items_clean AS oi
    INNER JOIN apjtechup.silver.orders_clean AS o
        ON oi.order_id = o.order_id
    GROUP BY
        oi.product_id,
        o.order_date_only
),
demand_analysis AS (
    SELECT
        product_id,
        SUM(CASE WHEN demand_date >= CURRENT_DATE() - INTERVAL 7 DAYS THEN units_sold ELSE 0 END) AS demand_7d,
        SUM(CASE WHEN demand_date >= CURRENT_DATE() - INTERVAL 30 DAYS THEN units_sold ELSE 0 END) AS demand_30d,
        SUM(CASE WHEN demand_date >= CURRENT_DATE() - INTERVAL 90 DAYS THEN units_sold ELSE 0 END) AS demand_90d,
        SUM(CASE WHEN demand_date >= CURRENT_DATE() - INTERVAL 90 DAYS THEN units_sold ELSE 0 END) / 90.0 AS avg_daily_demand,
        STDDEV(CASE WHEN demand_date >= CURRENT_DATE() - INTERVAL 90 DAYS THEN units_sold END) AS demand_stddev
    FROM daily_demand
    GROUP BY product_id
)
SELECT
    i.product_id,
    i.product_name,
    i.category_name,
    p.brand,
    p.supplier_name,
    i.warehouse_id,
    i.warehouse_region,

    -- Current State
    i.quantity_on_hand AS current_stock_level,
    i.quantity_reserved AS reserved_quantity,
    i.quantity_available AS available_quantity,
    i.reorder_level,

    -- Demand Patterns
    COALESCE(da.demand_7d, 0) AS demand_last_7d,
    COALESCE(da.demand_30d, 0) AS demand_last_30d,
    COALESCE(da.demand_90d, 0) AS demand_last_90d,
    ROUND(COALESCE(da.avg_daily_demand, 0), 2) AS average_daily_demand,
    ROUND(COALESCE(da.demand_stddev, 0), 2) AS demand_volatility,

    -- Supply Metrics (random placeholders, not measured values)
    FLOOR(RANDOM() * 14) + 7 AS lead_time_days, -- 7-20 days
    ROUND(RANDOM() * 0.3 + 0.7, 2) AS supplier_reliability_score, -- 0.7-1.0
    p.cost AS average_cost_per_unit,

    -- Performance Metrics
    CASE
        WHEN i.quantity_available > 0 AND da.avg_daily_demand > 0
        THEN ROUND(da.demand_90d / i.quantity_available, 2)
        ELSE 0
    END AS inventory_turnover_ratio,
    CASE
        -- Divide by the real daily rate; 999 is both the "no demand" sentinel
        -- and the ceiling, so slow movers never rank above zero-demand items.
        WHEN da.avg_daily_demand > 0
        THEN LEAST(FLOOR(i.quantity_available / da.avg_daily_demand), 999)
        ELSE 999
    END AS days_supply_current,
    CASE
        WHEN i.stock_status = 'Out of Stock' THEN 1.0
        WHEN i.stock_status = 'Low Stock' THEN 0.7
        ELSE 0.1
    END AS stockout_risk_score,
    CASE
        WHEN i.quantity_available >= i.reorder_level * 5 THEN 0.8
        WHEN i.quantity_available >= i.reorder_level * 3 THEN 0.5
        ELSE 0.1
    END AS overstock_risk_score,

    -- Financial Impact
    ROUND(COALESCE(i.quantity_available * p.cost, 0), 2) AS inventory_value,
    ROUND(COALESCE(i.quantity_available * p.cost * 0.02, 0), 2) AS carrying_cost_monthly, -- 2% monthly carrying cost
    ROUND(CASE WHEN i.stock_status = 'Out of Stock' THEN p.price * 10 ELSE 0 END, 2) AS stockout_cost_risk,
    ROUND(CASE WHEN i.overstocked THEN i.quantity_available * p.cost * 0.1 ELSE 0 END, 2) AS excess_inventory_cost,

    -- Optimization Recommendations
    GREATEST(ROUND(COALESCE(da.avg_daily_demand, 0) * 30), i.reorder_level) AS optimal_stock_level,
    ROUND(COALESCE(da.avg_daily_demand, 0) * 14) AS recommended_order_quantity, -- 2 weeks supply
    ROUND(COALESCE(da.avg_daily_demand, 0) * 7) AS reorder_point_suggestion, -- 1 week supply
    CASE
        WHEN COALESCE(da.demand_90d, 0) >= 1000 THEN 'A'
        WHEN COALESCE(da.demand_90d, 0) >= 100 THEN 'B'
        ELSE 'C'
    END AS classification,

    -- Forecasting (simplified: flat projection of the trailing daily rate)
    ROUND(COALESCE(da.avg_daily_demand, 0) * 7) AS demand_forecast_7d,
    ROUND(COALESCE(da.avg_daily_demand, 0) * 30) AS demand_forecast_30d,
    ROUND(COALESCE(da.avg_daily_demand, 0) * 90) AS demand_forecast_90d,
    ROUND(RANDOM() * 0.3 + 0.7, 2) AS forecast_accuracy_score, -- 0.7-1.0

    -- Status Flags
    i.needs_reorder,
    i.overstocked,
    CASE WHEN COALESCE(da.demand_90d, 0) < 10 THEN TRUE ELSE FALSE END AS is_slow_moving,
    CASE WHEN p.product_lifecycle_stage = 'Discontinued' THEN TRUE ELSE FALSE END AS is_obsolete_risk,

    -- Seasonal Patterns (placeholders)
    ROUND(RANDOM() * 0.4 + 0.8, 2) AS seasonality_factor, -- 0.8-1.2
    'Q4' AS peak_season_months,
    'Moderate' AS seasonal_demand_pattern,

    -- Metadata
    CURRENT_DATE() AS snapshot_date,
    CURRENT_TIMESTAMP() AS created_at,
    CURRENT_TIMESTAMP() AS updated_at
FROM apjtechup.silver.inventory_clean AS i
INNER JOIN apjtechup.silver.products_clean AS p
    ON i.product_id = p.product_id
LEFT JOIN demand_analysis AS da
    ON i.product_id = da.product_id;
