# Master Bronze to Silver ELT Process

This notebook runs the entire Bronze to Silver data transformation process. The transformation logic for each table is written inline in the cells below and runs in order (reference data first, then transactional data), including the additional tables not covered in the individual transformation notebooks.

The process includes:
1. Customer data transformation
2. Product data transformation
3. Order data transformation
4. Order items transformation
5. Inventory transformation
6. Reviews transformation
7. Web events transformation

Steps 1–3 include data quality checks; all Silver tables are optimized and their statistics are refreshed in the final cell.


In [0]:
-- Session variable holding the per-user Silver schema name:
-- 'silver_' followed by the current user name with every '@' and '.' removed.
-- NOTE(review): other characters (e.g. '-' or '+') are left in place — confirm
-- they cannot occur in user names, since the value is later used as an identifier.
DECLARE OR REPLACE VARIABLE myusername STRING;
SET VARIABLE myusername = (
    SELECT CONCAT('silver_', REPLACE(REPLACE(CURRENT_USER(), '@', ''), '.', ''))
);

In [0]:
-- Make apjtechup the current catalog for the rest of the notebook.
USE CATALOG apjtechup;

In [0]:
-- Create the per-user schema on first run, then make it the current schema so
-- the unqualified *_clean tables below are created inside it.
CREATE SCHEMA IF NOT EXISTS IDENTIFIER(myusername);
USE SCHEMA IDENTIFIER(myusername);

In [0]:
-- One-row marker recording when the ELT run started.
SELECT
    'Starting Bronze to Silver ELT Process' AS status,
    CURRENT_TIMESTAMP() AS start_time;


In [0]:
-- Step 1: customers (reference data is transformed first).
SELECT
    'Transforming Customers...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;


In [0]:
-- Build the Silver customers table from apjtechup.bronze.customers_raw.
--
-- Rows kept: non-NULL customer_id, an email containing '@', and non-blank
-- first and last names. Names and location fields are trimmed and upper-cased;
-- email and source_system are trimmed and lower-cased.
--
-- age, age_group, days_since_registration and customer_tier are computed
-- relative to CURRENT_DATE(), so their values depend on the day the cell runs.
CREATE OR REPLACE TABLE customers_clean AS
WITH valid_customers AS (
    -- Filter once and pre-compute the two values that several output columns
    -- are derived from, so each derivation is written in exactly one place.
    SELECT
        c.customer_id,
        c.first_name,
        c.last_name,
        c.email,
        c.phone,
        c.registration_date,
        c.birth_date,
        c.gender,
        c.address_line1,
        c.address_line2,
        c.city,
        c.state,
        c.country,
        c.postal_code,
        c.source_system,
        c.created_at,
        -- Age in completed years: the year difference, minus one when this
        -- year's birthday has not happened yet. NULL when birth_date is NULL.
        CASE
            WHEN c.birth_date IS NOT NULL
            THEN YEAR(CURRENT_DATE()) - YEAR(c.birth_date) -
                 CASE WHEN MONTH(CURRENT_DATE()) < MONTH(c.birth_date)
                      OR (MONTH(CURRENT_DATE()) = MONTH(c.birth_date) AND DAY(CURRENT_DATE()) < DAY(c.birth_date))
                      THEN 1 ELSE 0 END
            ELSE NULL
        END AS age,
        DATEDIFF(CURRENT_DATE(), c.registration_date) AS days_since_registration
    FROM apjtechup.bronze.customers_raw c
    WHERE c.customer_id IS NOT NULL
      AND c.email IS NOT NULL
      AND c.email LIKE '%@%'  -- Basic email validation
      AND LENGTH(TRIM(c.first_name)) > 0
      AND LENGTH(TRIM(c.last_name)) > 0
)
SELECT
    vc.customer_id,
    TRIM(UPPER(vc.first_name)) AS first_name,
    TRIM(UPPER(vc.last_name)) AS last_name,
    CONCAT(TRIM(UPPER(vc.first_name)), ' ', TRIM(UPPER(vc.last_name))) AS full_name,
    LOWER(TRIM(vc.email)) AS email,
    -- Text after the first '@' (element 1 of the zero-based split result).
    CASE
        WHEN vc.email LIKE '%@%' THEN SPLIT(LOWER(TRIM(vc.email)), '@')[1]
        ELSE NULL
    END AS email_domain,
    vc.phone,
    -- Digits only; every non-numeric character is removed.
    REGEXP_REPLACE(vc.phone, '[^0-9]', '') AS phone_cleaned,
    vc.registration_date,
    vc.birth_date,
    vc.age,
    -- Bucket on the completed-years age so age and age_group always agree.
    -- (Bucketing on the raw year difference would move a customer into the
    -- next band before their birthday.) Ages below 18 fall in the first band,
    -- as before.
    CASE
        WHEN vc.age IS NULL THEN 'Unknown'
        WHEN vc.age < 25 THEN '18-24'
        WHEN vc.age < 35 THEN '25-34'
        WHEN vc.age < 45 THEN '35-44'
        WHEN vc.age < 55 THEN '45-54'
        WHEN vc.age < 65 THEN '55-64'
        ELSE '65+'
    END AS age_group,
    COALESCE(UPPER(vc.gender), 'Unknown') AS gender,
    TRIM(vc.address_line1) AS address_line1,
    TRIM(vc.address_line2) AS address_line2,
    TRIM(UPPER(vc.city)) AS city,
    TRIM(UPPER(vc.state)) AS state,
    TRIM(UPPER(vc.country)) AS country,
    vc.postal_code,
    YEAR(vc.registration_date) AS registration_year,
    MONTH(vc.registration_date) AS registration_month,
    vc.days_since_registration,
    -- Customer tier based on registration tenure in days
    -- (730 = two years, 365 = one year, 90 = about three months).
    CASE
        WHEN vc.days_since_registration >= 730 THEN 'Gold'
        WHEN vc.days_since_registration >= 365 THEN 'Silver'
        WHEN vc.days_since_registration >= 90 THEN 'Bronze'
        ELSE 'New'
    END AS customer_tier,
    TRIM(LOWER(vc.source_system)) AS source_system,
    vc.created_at,
    CURRENT_TIMESTAMP() AS updated_at
FROM valid_customers vc;


In [0]:
-- Single-row quality summary of customers_clean: row count, how many rows have
-- usable contact and age data, average tenure, and a count per customer tier.
CREATE OR REPLACE TEMPORARY VIEW customer_quality_metrics AS
SELECT
    COUNT(*) AS total_customers,
    COUNT_IF(email_domain IS NOT NULL) AS valid_emails,
    -- A phone counts as valid when at least 10 digits remain after cleaning.
    COUNT_IF(phone_cleaned IS NOT NULL AND LENGTH(phone_cleaned) >= 10) AS valid_phones,
    COUNT_IF(age IS NOT NULL) AS customers_with_age,
    COUNT_IF(age_group != 'Unknown') AS customers_with_age_group,
    AVG(days_since_registration) AS avg_days_since_registration,
    COUNT_IF(customer_tier = 'Gold') AS gold_customers,
    COUNT_IF(customer_tier = 'Silver') AS silver_customers,
    COUNT_IF(customer_tier = 'Bronze') AS bronze_customers,
    COUNT_IF(customer_tier = 'New') AS new_customers
FROM customers_clean;


In [0]:
-- Show the customer quality summary.
SELECT
    total_customers,
    valid_emails,
    valid_phones,
    customers_with_age,
    customers_with_age_group,
    avg_days_since_registration,
    gold_customers,
    silver_customers,
    bronze_customers,
    new_customers
FROM customer_quality_metrics;


In [0]:
-- Step 2: products (reference data).
SELECT
    'Transforming Products...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;


The cells below build the `products_clean` table and the views used to check its quality.


In [0]:
-- Build the Silver products table from apjtechup.bronze.products_raw, enriched
-- with category, parent category and supplier names from the Bronze layer.
--
-- Rows kept: non-NULL product_id, non-blank product_name, price > 0 and
-- cost >= 0 (rows with a NULL price or cost are dropped by those comparisons).
-- Lifecycle stage and days_since_launch are relative to CURRENT_DATE().
CREATE OR REPLACE TABLE products_clean AS
WITH brand_counts AS (
    -- Number of Bronze products per brand, used for brand categorization.
    -- Counted on the trimmed brand so that values differing only in
    -- surrounding whitespace are one brand, matching the trimmed `brand`
    -- column written to the Silver table.
    SELECT
        TRIM(brand) AS brand,
        COUNT(*) AS product_count
    FROM apjtechup.bronze.products_raw
    WHERE brand IS NOT NULL
    GROUP BY TRIM(brand)
)
SELECT
    p.product_id,
    TRIM(p.product_name) AS product_name,
    -- Collapse every run of whitespace inside the name to a single space.
    REGEXP_REPLACE(TRIM(p.product_name), '\\s+', ' ') AS product_name_clean,
    TRIM(p.description) AS description,
    p.category_id,
    TRIM(c.category_name) AS category_name,
    -- Category hierarchy as 'Parent > Child'. Falls back to the category name
    -- alone when there is no parent OR when the parent row is missing from
    -- categories_raw; testing only parent_category_id would make CONCAT return
    -- NULL for such orphaned references.
    CASE
        WHEN pc.category_name IS NOT NULL
        THEN CONCAT(TRIM(pc.category_name), ' > ', TRIM(c.category_name))
        ELSE TRIM(c.category_name)
    END AS category_path,
    p.supplier_id,
    TRIM(s.supplier_name) AS supplier_name,
    p.sku,
    p.price,
    p.cost,
    -- Profit margin as a percentage of price; NULLIF guards the division.
    ROUND(((p.price - p.cost) / NULLIF(p.price, 0)) * 100, 2) AS profit_margin,
    -- Price tier classification
    CASE
        WHEN p.price < 25 THEN 'Budget'
        WHEN p.price < 100 THEN 'Mid-Range'
        WHEN p.price < 500 THEN 'Premium'
        ELSE 'Luxury'
    END AS price_tier,
    p.weight,
    -- Weight category (the unit of weight is not stated in this notebook).
    CASE
        WHEN p.weight IS NULL THEN 'Unknown'
        WHEN p.weight < 1 THEN 'Light'
        WHEN p.weight < 5 THEN 'Medium'
        WHEN p.weight < 20 THEN 'Heavy'
        ELSE 'Very Heavy'
    END AS weight_category,
    p.dimensions,
    TRIM(UPPER(p.color)) AS color,
    TRIM(UPPER(p.size)) AS size,
    TRIM(p.brand) AS brand,
    -- Brand category based on how many Bronze products carry the brand.
    -- Products with a NULL brand have no match and fall into the ELSE branch.
    CASE
        WHEN bc.product_count >= 100 THEN 'Major Brand'
        WHEN bc.product_count >= 20 THEN 'Popular Brand'
        WHEN bc.product_count >= 5 THEN 'Niche Brand'
        ELSE 'Boutique Brand'
    END AS brand_category,
    CASE WHEN UPPER(p.status) = 'ACTIVE' THEN TRUE ELSE FALSE END AS is_active,
    -- Product lifecycle stage: explicit discontinuation wins, otherwise the
    -- stage is chosen by days since created_at.
    CASE
        WHEN UPPER(p.status) = 'DISCONTINUED' THEN 'Discontinued'
        WHEN DATEDIFF(CURRENT_DATE(), p.created_at) <= 30 THEN 'New Launch'
        WHEN DATEDIFF(CURRENT_DATE(), p.created_at) <= 180 THEN 'Growth'
        WHEN DATEDIFF(CURRENT_DATE(), p.created_at) <= 730 THEN 'Mature'
        ELSE 'Legacy'
    END AS product_lifecycle_stage,
    p.created_at,
    p.updated_at,
    DATEDIFF(CURRENT_DATE(), p.created_at) AS days_since_launch,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM apjtechup.bronze.products_raw p
LEFT JOIN apjtechup.bronze.categories_raw c ON p.category_id = c.category_id
LEFT JOIN apjtechup.bronze.categories_raw pc ON c.parent_category_id = pc.category_id
LEFT JOIN apjtechup.bronze.suppliers_raw s ON p.supplier_id = s.supplier_id
LEFT JOIN brand_counts bc ON TRIM(p.brand) = bc.brand
WHERE p.product_id IS NOT NULL
  AND p.product_name IS NOT NULL
  AND LENGTH(TRIM(p.product_name)) > 0
  AND p.price > 0
  AND p.cost >= 0;


In [0]:
-- Single-row quality summary of products_clean: totals, enrichment coverage,
-- margin, and a count per price tier and per lifecycle stage.
CREATE OR REPLACE TEMPORARY VIEW product_quality_metrics AS
SELECT
    COUNT(*) AS total_products,
    COUNT_IF(is_active) AS active_products,
    COUNT_IF(category_name IS NOT NULL) AS products_with_categories,
    COUNT_IF(supplier_name IS NOT NULL) AS products_with_suppliers,
    COUNT_IF(profit_margin > 0) AS profitable_products,
    AVG(profit_margin) AS avg_profit_margin,
    -- Price tiers
    COUNT_IF(price_tier = 'Budget') AS budget_products,
    COUNT_IF(price_tier = 'Mid-Range') AS midrange_products,
    COUNT_IF(price_tier = 'Premium') AS premium_products,
    COUNT_IF(price_tier = 'Luxury') AS luxury_products,
    -- Lifecycle stages
    COUNT_IF(product_lifecycle_stage = 'New Launch') AS new_launches,
    COUNT_IF(product_lifecycle_stage = 'Growth') AS growth_stage,
    COUNT_IF(product_lifecycle_stage = 'Mature') AS mature_products,
    COUNT_IF(product_lifecycle_stage = 'Legacy') AS legacy_products,
    COUNT_IF(product_lifecycle_stage = 'Discontinued') AS discontinued_products
FROM products_clean;


In [0]:
-- Per-category product statistics, largest categories first.
-- Products without a category name are excluded.
CREATE OR REPLACE TEMPORARY VIEW category_distribution AS
SELECT
    pc.category_name,
    COUNT(*) AS product_count,
    AVG(pc.price) AS avg_price,
    AVG(pc.profit_margin) AS avg_margin,
    COUNT_IF(pc.is_active) AS active_count
FROM products_clean AS pc
WHERE pc.category_name IS NOT NULL
GROUP BY pc.category_name
ORDER BY product_count DESC;


In [0]:
-- Top 20 brands by number of products in products_clean, with the average
-- price and margin per (brand, brand_category) pair. Products without a brand
-- are excluded.
CREATE OR REPLACE TEMPORARY VIEW brand_distribution AS
SELECT
    brand,
    brand_category,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price,
    AVG(profit_margin) AS avg_margin
FROM products_clean
WHERE brand IS NOT NULL
GROUP BY brand, brand_category
-- brand and brand_category break ties on product_count; without them the rows
-- that make the LIMIT 20 cut could differ from one run to the next.
ORDER BY product_count DESC, brand, brand_category
LIMIT 20;


In [0]:
-- Display the product quality views, each preceded by a one-row label.
-- NOTE(review): some notebook front ends render only the result of the last
-- statement in a cell — confirm all six results are visible where this runs.
SELECT 'Product Quality Metrics' AS metric_type;
SELECT
    total_products,
    active_products,
    products_with_categories,
    products_with_suppliers,
    profitable_products,
    avg_profit_margin,
    budget_products,
    midrange_products,
    premium_products,
    luxury_products,
    new_launches,
    growth_stage,
    mature_products,
    legacy_products,
    discontinued_products
FROM product_quality_metrics;

SELECT 'Top Categories' AS metric_type;
SELECT
    category_name,
    product_count,
    avg_price,
    avg_margin,
    active_count
FROM category_distribution
LIMIT 10;

SELECT 'Top Brands' AS metric_type;
SELECT
    brand,
    brand_category,
    product_count,
    avg_price,
    avg_margin
FROM brand_distribution;


In [0]:
-- Step 3: orders (transaction data).
SELECT
    'Transforming Orders...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;


The cells below build the `orders_clean` table and the views used to analyze it.


In [0]:
-- Build the Silver orders table from apjtechup.bronze.orders_raw.
-- Rows kept: non-NULL order_id, customer_id and order_date, and a
-- non-negative total_amount. Adds calendar parts of order_date, grouped
-- status / payment categories, and amount-derived tiers and flags.
CREATE OR REPLACE TABLE orders_clean AS
WITH normalized_orders AS (
    -- Filter the Bronze rows and normalize the free-text fields once, so the
    -- categorizations below work from a single cleaned value.
    SELECT
        src.order_id,
        src.customer_id,
        src.order_date,
        UPPER(TRIM(src.order_status)) AS status_norm,
        LOWER(TRIM(src.payment_method)) AS payment_method_norm,
        UPPER(TRIM(src.payment_status)) AS payment_status_norm,
        TRIM(src.shipping_address) AS shipping_address_trimmed,
        TRIM(src.billing_address) AS billing_address_trimmed,
        src.total_amount,
        src.tax_amount,
        src.shipping_cost,
        src.discount_amount,
        src.created_at,
        src.updated_at
    FROM apjtechup.bronze.orders_raw AS src
    WHERE src.order_id IS NOT NULL
      AND src.customer_id IS NOT NULL
      AND src.order_date IS NOT NULL
      AND src.total_amount >= 0
)
SELECT
    n.order_id,
    n.customer_id,
    n.order_date,
    DATE(n.order_date) AS order_date_only,
    YEAR(n.order_date) AS order_year,
    MONTH(n.order_date) AS order_month,
    QUARTER(n.order_date) AS order_quarter,
    -- DAYOFWEEK numbers days from 1 = Sunday through 7 = Saturday.
    CASE DAYOFWEEK(n.order_date)
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END AS order_day_of_week,
    HOUR(n.order_date) AS order_hour,
    n.status_norm AS order_status,
    -- Coarse grouping of the normalized order status.
    CASE
        WHEN n.status_norm IN ('PENDING', 'PROCESSING') THEN 'In Progress'
        WHEN n.status_norm IN ('SHIPPED', 'DELIVERED') THEN 'Completed'
        WHEN n.status_norm IN ('CANCELLED', 'RETURNED') THEN 'Cancelled/Returned'
        ELSE 'Other'
    END AS order_status_category,
    n.payment_method_norm AS payment_method,
    -- Coarse grouping of the normalized payment method.
    CASE
        WHEN n.payment_method_norm IN ('credit_card', 'debit_card') THEN 'Card Payment'
        WHEN n.payment_method_norm IN ('paypal', 'apple_pay', 'google_pay') THEN 'Digital Wallet'
        ELSE 'Other'
    END AS payment_method_category,
    n.payment_status_norm AS payment_status,
    n.shipping_address_trimmed AS shipping_address,
    n.billing_address_trimmed AS billing_address,
    -- TRUE only when both addresses are present and equal after trimming;
    -- a NULL on either side yields FALSE.
    COALESCE(n.shipping_address_trimmed = n.billing_address_trimmed, FALSE) AS same_billing_shipping,
    n.total_amount,
    n.tax_amount,
    n.shipping_cost,
    n.discount_amount,
    -- Total with tax removed; a missing tax amount is treated as zero.
    ROUND(n.total_amount - COALESCE(n.tax_amount, 0), 2) AS net_amount,
    -- Order value tier
    CASE
        WHEN n.total_amount < 50 THEN 'Small'
        WHEN n.total_amount < 150 THEN 'Medium'
        WHEN n.total_amount < 500 THEN 'Large'
        ELSE 'Enterprise'
    END AS order_value_tier,
    COALESCE(n.discount_amount, 0) > 0 AS has_discount,
    -- Discount as a percentage of total_amount; 0 when there is no discount
    -- or the total is zero.
    CASE
        WHEN n.total_amount > 0 AND COALESCE(n.discount_amount, 0) > 0
        THEN ROUND((n.discount_amount / n.total_amount) * 100, 2)
        ELSE 0
    END AS discount_percentage,
    n.created_at,
    n.updated_at,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM normalized_orders AS n;


In [0]:
-- Single-row summary of orders_clean: volume by status category, revenue,
-- discount usage, and a count per order value tier.
CREATE OR REPLACE TEMPORARY VIEW order_quality_metrics AS
SELECT
    COUNT(*) AS total_orders,
    COUNT_IF(order_status_category = 'Completed') AS completed_orders,
    COUNT_IF(order_status_category = 'In Progress') AS in_progress_orders,
    COUNT_IF(order_status_category = 'Cancelled/Returned') AS cancelled_orders,
    ROUND(AVG(total_amount), 2) AS avg_order_value,
    ROUND(SUM(total_amount), 2) AS total_revenue,
    COUNT_IF(has_discount) AS orders_with_discount,
    -- Average discount percentage over discounted orders only.
    ROUND(AVG(discount_percentage) FILTER (WHERE has_discount), 2) AS avg_discount_percentage,
    COUNT_IF(order_value_tier = 'Small') AS small_orders,
    COUNT_IF(order_value_tier = 'Medium') AS medium_orders,
    COUNT_IF(order_value_tier = 'Large') AS large_orders,
    COUNT_IF(order_value_tier = 'Enterprise') AS enterprise_orders
FROM orders_clean;


In [0]:
-- Order volume, average value and completion rate for each payment method,
-- most-used methods first.
CREATE OR REPLACE TEMPORARY VIEW payment_method_analysis AS
SELECT
    oc.payment_method_category,
    oc.payment_method,
    COUNT(*) AS order_count,
    ROUND(AVG(oc.total_amount), 2) AS avg_order_value,
    COUNT_IF(oc.order_status_category = 'Completed') AS completed_count,
    -- Completed orders as a percentage of all orders for the method.
    ROUND(COUNT_IF(oc.order_status_category = 'Completed') * 100.0 / COUNT(*), 2) AS completion_rate
FROM orders_clean AS oc
GROUP BY
    oc.payment_method_category,
    oc.payment_method
ORDER BY order_count DESC;


In [0]:
-- Order count and revenue per day of the week, listed Monday through Sunday.
CREATE OR REPLACE TEMPORARY VIEW temporal_patterns AS
SELECT
    oc.order_day_of_week,
    COUNT(*) AS order_count,
    ROUND(AVG(oc.total_amount), 2) AS avg_order_value,
    ROUND(SUM(oc.total_amount), 2) AS total_revenue
FROM orders_clean AS oc
GROUP BY oc.order_day_of_week
ORDER BY
    -- Map each day name to its position in a Monday-first week.
    CASE
        WHEN oc.order_day_of_week = 'Monday' THEN 1
        WHEN oc.order_day_of_week = 'Tuesday' THEN 2
        WHEN oc.order_day_of_week = 'Wednesday' THEN 3
        WHEN oc.order_day_of_week = 'Thursday' THEN 4
        WHEN oc.order_day_of_week = 'Friday' THEN 5
        WHEN oc.order_day_of_week = 'Saturday' THEN 6
        WHEN oc.order_day_of_week = 'Sunday' THEN 7
    END;


In [0]:
-- Order count and average order value for each hour of the day found in
-- orders_clean, in ascending hour order.
CREATE OR REPLACE TEMPORARY VIEW hourly_patterns AS
SELECT
    oc.order_hour,
    COUNT(*) AS order_count,
    ROUND(AVG(oc.total_amount), 2) AS avg_order_value
FROM orders_clean AS oc
GROUP BY oc.order_hour
ORDER BY oc.order_hour;


In [0]:
-- Order volume, revenue and distinct customers per calendar month,
-- oldest month first.
CREATE OR REPLACE TEMPORARY VIEW monthly_trends AS
SELECT
    oc.order_year,
    oc.order_month,
    COUNT(*) AS order_count,
    ROUND(AVG(oc.total_amount), 2) AS avg_order_value,
    ROUND(SUM(oc.total_amount), 2) AS total_revenue,
    COUNT(DISTINCT oc.customer_id) AS unique_customers
FROM orders_clean AS oc
GROUP BY
    oc.order_year,
    oc.order_month
ORDER BY
    oc.order_year,
    oc.order_month;


In [0]:
-- Display the order analytics views, each preceded by a one-row label.
-- NOTE(review): some notebook front ends render only the result of the last
-- statement in a cell — confirm all ten results are visible where this runs.
SELECT 'Order Quality Metrics' AS metric_type;
SELECT
    total_orders,
    completed_orders,
    in_progress_orders,
    cancelled_orders,
    avg_order_value,
    total_revenue,
    orders_with_discount,
    avg_discount_percentage,
    small_orders,
    medium_orders,
    large_orders,
    enterprise_orders
FROM order_quality_metrics;

SELECT 'Payment Method Analysis' AS metric_type;
SELECT
    payment_method_category,
    payment_method,
    order_count,
    avg_order_value,
    completed_count,
    completion_rate
FROM payment_method_analysis;

SELECT 'Day of Week Patterns' AS metric_type;
SELECT
    order_day_of_week,
    order_count,
    avg_order_value,
    total_revenue
FROM temporal_patterns;

-- Ten busiest hours by order count.
SELECT 'Hourly Patterns (Top 10)' AS metric_type;
SELECT
    order_hour,
    order_count,
    avg_order_value
FROM hourly_patterns
ORDER BY order_count DESC
LIMIT 10;

-- Twelve most recent months, newest first.
SELECT 'Monthly Trends' AS metric_type;
SELECT
    order_year,
    order_month,
    order_count,
    avg_order_value,
    total_revenue,
    unique_customers
FROM monthly_trends
ORDER BY
    order_year DESC,
    order_month DESC
LIMIT 12;


In [0]:
-- Step 4: order items (large transaction table).
SELECT
    'Transforming Order Items...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;


-- Build the Silver order items table from apjtechup.bronze.order_items_raw,
-- enriched with product name, brand, unit cost and category name from Bronze.
-- Rows kept: non-NULL order_item_id, order_id and product_id, quantity > 0
-- and unit_price >= 0. Items whose product is missing keep NULL product
-- fields, and therefore a NULL item_profit.
CREATE OR REPLACE TABLE order_items_clean AS
WITH costed_items AS (
    -- Join the lookups once and pre-compute the total cost of each line.
    SELECT
        li.order_item_id,
        li.order_id,
        li.product_id,
        TRIM(prod.product_name) AS product_name,
        TRIM(cat.category_name) AS category_name,
        TRIM(prod.brand) AS brand,
        li.quantity,
        li.unit_price,
        li.discount_amount,
        li.total_amount,
        prod.cost AS unit_cost,
        prod.cost * li.quantity AS line_cost,
        li.created_at
    FROM apjtechup.bronze.order_items_raw AS li
    LEFT JOIN apjtechup.bronze.products_raw AS prod
        ON li.product_id = prod.product_id
    LEFT JOIN apjtechup.bronze.categories_raw AS cat
        ON prod.category_id = cat.category_id
    WHERE li.order_item_id IS NOT NULL
      AND li.order_id IS NOT NULL
      AND li.product_id IS NOT NULL
      AND li.quantity > 0
      AND li.unit_price >= 0
)
SELECT
    ci.order_item_id,
    ci.order_id,
    ci.product_id,
    ci.product_name,
    ci.category_name,
    ci.brand,
    ci.quantity,
    ci.unit_price,
    ci.discount_amount,
    ci.total_amount,
    ci.unit_cost,
    -- Line revenue minus line cost.
    ROUND(ci.total_amount - (ci.line_cost), 2) AS item_profit,
    -- Profit as a percentage of line revenue; 0 when revenue is not positive.
    CASE
        WHEN ci.total_amount > 0
        THEN ROUND(((ci.total_amount - (ci.line_cost)) / ci.total_amount) * 100, 2)
        ELSE 0
    END AS item_margin,
    COALESCE(ci.discount_amount, 0) > 0 AS is_discounted,
    -- Quantity band for the line.
    CASE
        WHEN ci.quantity = 1 THEN 'Single'
        WHEN ci.quantity <= 3 THEN 'Small'
        WHEN ci.quantity <= 10 THEN 'Medium'
        ELSE 'Bulk'
    END AS quantity_tier,
    ci.created_at,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM costed_items AS ci;


In [0]:
-- Step 5: inventory.
SELECT
    'Transforming Inventory...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

-- Build the Silver inventory table from apjtechup.bronze.inventory_raw,
-- enriched with product and category names from the Bronze layer.
-- Rows kept: non-NULL inventory_id and product_id, quantity_on_hand >= 0.
--
-- NOTE(review): when quantity_available or reorder_level is NULL every
-- comparison below is unknown, so the row falls through to the ELSE branches
-- ('High Stock', 'Slow Moving', FALSE). Confirm that is the intended labeling.
CREATE OR REPLACE TABLE inventory_clean AS
SELECT
    i.inventory_id,
    i.product_id,
    TRIM(p.product_name) AS product_name,
    TRIM(c.category_name) AS category_name,
    i.warehouse_id,
    -- Region assigned from warehouse_id ranges.
    CASE
        WHEN i.warehouse_id <= 5 THEN 'East'
        WHEN i.warehouse_id <= 10 THEN 'Central'
        WHEN i.warehouse_id <= 15 THEN 'West'
        ELSE 'International'
    END AS warehouse_region,
    i.quantity_on_hand,
    i.quantity_reserved,
    i.quantity_available,
    i.reorder_level,
    -- Stock status relative to the reorder level.
    CASE
        WHEN i.quantity_available <= 0 THEN 'Out of Stock'
        WHEN i.quantity_available <= i.reorder_level THEN 'Low Stock'
        WHEN i.quantity_available <= i.reorder_level * 2 THEN 'Normal Stock'
        ELSE 'High Stock'
    END AS stock_status,
    -- Rough days of supply estimate.
    -- NOTE(review): this divides a quantity by (unit cost * 30), which is not
    -- a demand rate; treat the value as a placeholder until sales velocity is
    -- available.
    CASE
        WHEN p.cost > 0 AND i.quantity_available > 0
        THEN ROUND(i.quantity_available / (p.cost * 30), 1)
        ELSE 0
    END AS days_of_supply,
    -- Turnover proxy inferred from how far stock sits below or above the
    -- reorder level.
    CASE
        WHEN i.quantity_available <= i.reorder_level * 0.5 THEN 'Fast Moving'
        WHEN i.quantity_available <= i.reorder_level * 2 THEN 'Normal Moving'
        ELSE 'Slow Moving'
    END AS turnover_category,
    CASE WHEN i.quantity_available <= i.reorder_level THEN TRUE ELSE FALSE END AS needs_reorder,
    -- Overstocked = at least five times the reorder level AND actually in
    -- stock. Without the second condition a row with quantity_available = 0
    -- and reorder_level = 0 was flagged overstocked while also being
    -- 'Out of Stock' and needing reorder.
    CASE
        WHEN i.quantity_available > 0
         AND i.quantity_available >= i.reorder_level * 5 THEN TRUE
        ELSE FALSE
    END AS overstocked,
    i.last_updated,
    i.created_at,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM apjtechup.bronze.inventory_raw i
LEFT JOIN apjtechup.bronze.products_raw p ON i.product_id = p.product_id
LEFT JOIN apjtechup.bronze.categories_raw c ON p.category_id = c.category_id
WHERE i.inventory_id IS NOT NULL
  AND i.product_id IS NOT NULL
  AND i.quantity_on_hand >= 0;


In [0]:
-- Step 6: reviews.
SELECT
    'Transforming Reviews...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

-- Build the Silver reviews table from apjtechup.bronze.reviews_raw, enriched
-- with product and category names from the Bronze layer.
-- Rows kept: non-NULL review_id and product_id, rating between 1 and 5.
CREATE OR REPLACE TABLE reviews_clean AS
WITH banded_reviews AS (
    -- Join the lookups and compute the rating band once; it feeds both
    -- rating_category and sentiment_category, which use the same rule.
    SELECT
        rv.review_id,
        rv.product_id,
        TRIM(prod.product_name) AS product_name,
        TRIM(cat.category_name) AS category_name,
        rv.customer_id,
        rv.rating,
        CASE
            WHEN rv.rating >= 4 THEN 'Positive'
            WHEN rv.rating >= 3 THEN 'Neutral'
            ELSE 'Negative'
        END AS rating_band,
        rv.review_text,
        rv.review_date,
        rv.verified_purchase,
        rv.helpful_votes,
        rv.created_at
    FROM apjtechup.bronze.reviews_raw AS rv
    LEFT JOIN apjtechup.bronze.products_raw AS prod
        ON rv.product_id = prod.product_id
    LEFT JOIN apjtechup.bronze.categories_raw AS cat
        ON prod.category_id = cat.category_id
    WHERE rv.review_id IS NOT NULL
      AND rv.product_id IS NOT NULL
      AND rv.rating BETWEEN 1 AND 5
)
SELECT
    b.review_id,
    b.product_id,
    b.product_name,
    b.category_name,
    b.customer_id,
    b.rating,
    b.rating_band AS rating_category,
    b.review_text,
    -- Length of the review text; a missing review counts as length 0.
    LENGTH(COALESCE(b.review_text, '')) AS review_text_length,
    -- Simple sentiment score derived from the rating alone: 5 maps to 1.0 and
    -- 1 maps to -1.0 in steps of 0.5; any other value scores 0.0.
    CASE b.rating
        WHEN 5 THEN 1.0
        WHEN 4 THEN 0.5
        WHEN 3 THEN 0.0
        WHEN 2 THEN -0.5
        WHEN 1 THEN -1.0
        ELSE 0.0
    END AS sentiment_score,
    b.rating_band AS sentiment_category,
    b.review_date,
    YEAR(b.review_date) AS review_year,
    MONTH(b.review_date) AS review_month,
    b.verified_purchase,
    b.helpful_votes,
    -- Helpfulness band by number of helpful votes.
    CASE
        WHEN b.helpful_votes >= 20 THEN 'Very Helpful'
        WHEN b.helpful_votes >= 5 THEN 'Helpful'
        WHEN b.helpful_votes >= 1 THEN 'Somewhat Helpful'
        ELSE 'Not Helpful'
    END AS helpfulness_tier,
    -- Placeholder: days between the review and a fixed reference date, not a
    -- real purchase date.
    -- NOTE(review): replace with the actual purchase date once it is available.
    DATEDIFF(b.review_date, '2023-01-01') AS days_since_purchase,
    -- Recency of the review relative to the run date.
    CASE
        WHEN DATEDIFF(CURRENT_DATE(), b.review_date) <= 30 THEN 'Recent'
        WHEN DATEDIFF(CURRENT_DATE(), b.review_date) <= 90 THEN 'Moderate'
        ELSE 'Old'
    END AS review_recency,
    b.created_at,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM banded_reviews AS b;


In [0]:
-- Step 7: web events.
SELECT
    'Transforming Web Events...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

-- Build the Silver web events table from apjtechup.bronze.web_events_raw,
-- enriched with the product's category name from the Bronze layer.
-- Rows kept: non-NULL event_id and timestamp.
--
-- Device, browser and operating system are inferred from substrings of the
-- lower-cased user agent. User agents routinely contain the tokens of other
-- products (Edge contains "Chrome" and "Safari"; Chrome contains "Safari";
-- Android contains "Linux"; iPhone/iPad contain "like Mac OS X"; iPad contains
-- "Mobile"), so every CASE below tests the MORE SPECIFIC token first.
CREATE OR REPLACE TABLE web_events_clean AS
WITH events AS (
    -- Join the lookups and normalize the values that are tested repeatedly.
    SELECT
        we.event_id,
        we.session_id,
        we.customer_id,
        LOWER(TRIM(we.event_type)) AS event_type_norm,
        we.page_url,
        we.product_id,
        TRIM(c.category_name) AS product_category,
        we.timestamp,
        we.ip_address,
        we.user_agent,
        LOWER(we.user_agent) AS ua,
        we.referrer,
        we.created_at
    FROM apjtechup.bronze.web_events_raw we
    LEFT JOIN apjtechup.bronze.products_raw p ON we.product_id = p.product_id
    LEFT JOIN apjtechup.bronze.categories_raw c ON p.category_id = c.category_id
    WHERE we.event_id IS NOT NULL
      AND we.timestamp IS NOT NULL
)
SELECT
    e.event_id,
    e.session_id,
    e.customer_id,
    e.event_type_norm AS event_type,
    -- Funnel-style grouping of the event type.
    CASE
        WHEN e.event_type_norm IN ('page_view', 'product_view') THEN 'Browsing'
        WHEN e.event_type_norm IN ('add_to_cart', 'remove_from_cart') THEN 'Cart Activity'
        WHEN e.event_type_norm IN ('checkout_start', 'purchase') THEN 'Purchase'
        WHEN e.event_type_norm IN ('search', 'filter') THEN 'Search'
        ELSE 'Other'
    END AS event_category,
    e.page_url,
    -- Page type from URL path fragments; the first matching fragment wins.
    CASE
        WHEN e.page_url LIKE '%/product%' THEN 'Product Page'
        WHEN e.page_url LIKE '%/category%' THEN 'Category Page'
        WHEN e.page_url LIKE '%/cart%' THEN 'Cart Page'
        WHEN e.page_url LIKE '%/checkout%' THEN 'Checkout Page'
        WHEN e.page_url LIKE '%/search%' THEN 'Search Page'
        WHEN e.page_url = '/home' THEN 'Home Page'
        ELSE 'Other Page'
    END AS page_category,
    e.product_id,
    e.product_category,
    e.timestamp,
    DATE(e.timestamp) AS event_date,
    HOUR(e.timestamp) AS event_hour,
    -- DAYOFWEEK numbers days from 1 = Sunday through 7 = Saturday.
    CASE DAYOFWEEK(e.timestamp)
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END AS event_day_of_week,
    e.ip_address,
    e.user_agent,
    -- Tablet is tested before Mobile because iPad user agents also contain
    -- "Mobile". A NULL user agent falls through to 'Desktop'.
    CASE
        WHEN e.ua LIKE '%tablet%' OR e.ua LIKE '%ipad%' THEN 'Tablet'
        WHEN e.ua LIKE '%mobile%' OR e.ua LIKE '%android%' OR e.ua LIKE '%iphone%' THEN 'Mobile'
        ELSE 'Desktop'
    END AS device_type,
    -- Edge first (its user agent also contains "Chrome" and "Safari"), and
    -- Chrome before Safari (Chrome's user agent also contains "Safari").
    -- 'edg/', 'edga/' and 'edgios/' are the Chromium-based Edge tokens.
    CASE
        WHEN e.ua LIKE '%edge%' OR e.ua LIKE '%edg/%'
          OR e.ua LIKE '%edga/%' OR e.ua LIKE '%edgios/%' THEN 'Edge'
        WHEN e.ua LIKE '%firefox%' THEN 'Firefox'
        WHEN e.ua LIKE '%chrome%' THEN 'Chrome'
        WHEN e.ua LIKE '%safari%' THEN 'Safari'
        ELSE 'Other'
    END AS browser,
    -- Android before Linux and iOS devices before Mac, because the mobile
    -- user agents contain the desktop tokens as well.
    CASE
        WHEN e.ua LIKE '%android%' THEN 'Android'
        WHEN e.ua LIKE '%iphone%' OR e.ua LIKE '%ipad%' OR e.ua LIKE '%ios%' THEN 'iOS'
        WHEN e.ua LIKE '%windows%' THEN 'Windows'
        WHEN e.ua LIKE '%mac%' THEN 'MacOS'
        WHEN e.ua LIKE '%linux%' THEN 'Linux'
        ELSE 'Other'
    END AS operating_system,
    e.referrer,
    -- Traffic source; a missing or blank referrer is direct traffic.
    CASE
        WHEN e.referrer IS NULL OR TRIM(e.referrer) = '' THEN 'Direct'
        WHEN LOWER(e.referrer) LIKE '%google%' THEN 'Google'
        WHEN LOWER(e.referrer) LIKE '%facebook%' THEN 'Facebook'
        WHEN LOWER(e.referrer) LIKE '%twitter%' THEN 'Twitter'
        WHEN LOWER(e.referrer) LIKE '%instagram%' THEN 'Instagram'
        ELSE 'Other Referrer'
    END AS referrer_category,
    -- Same rule as device_type, so is_mobile is TRUE exactly when
    -- device_type = 'Mobile' (tablets are not counted as mobile).
    CASE
        WHEN e.ua LIKE '%tablet%' OR e.ua LIKE '%ipad%' THEN FALSE
        WHEN e.ua LIKE '%mobile%' OR e.ua LIKE '%android%' OR e.ua LIKE '%iphone%' THEN TRUE
        ELSE FALSE
    END AS is_mobile,
    'Unknown' AS session_duration_bucket, -- Would need session analysis for this
    e.created_at,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM events e;


In [0]:
-- Final maintenance pass over every Silver table built above: OPTIMIZE the
-- table, then recompute its table-level statistics.
SELECT
    'Optimizing all Silver tables...' AS step,
    CURRENT_TIMESTAMP() AS timestamp;

-- Customers
OPTIMIZE customers_clean;
ANALYZE TABLE customers_clean COMPUTE STATISTICS;

-- Products
OPTIMIZE products_clean;
ANALYZE TABLE products_clean COMPUTE STATISTICS;

-- Orders
OPTIMIZE orders_clean;
ANALYZE TABLE orders_clean COMPUTE STATISTICS;

-- Order items
OPTIMIZE order_items_clean;
ANALYZE TABLE order_items_clean COMPUTE STATISTICS;

-- Inventory
OPTIMIZE inventory_clean;
ANALYZE TABLE inventory_clean COMPUTE STATISTICS;

-- Reviews
OPTIMIZE reviews_clean;
ANALYZE TABLE reviews_clean COMPUTE STATISTICS;

-- Web events
OPTIMIZE web_events_clean;
ANALYZE TABLE web_events_clean COMPUTE STATISTICS;