# Bronze to Silver ELT: Product Data Transformation

This notebook cleans and enriches product data from the bronze layer to create the silver layer product tables. The transformation includes:

- Data cleaning and standardization
- Product categorization and enrichment
- Brand analysis and categorization
- Product lifecycle tracking
- Data quality metrics calculation
- Table optimization


In [0]:

-- Session context: select the apjtechup catalog and its silver schema so the
-- unqualified table names used by later statements (e.g. products_clean)
-- resolve against them.
USE CATALOG apjtechup;
USE SCHEMA silver;


In [0]:
-- Rebuild products_clean (silver) from the bronze product, category and
-- supplier tables. INSERT OVERWRITE replaces the table's current contents.
--
-- Rows are excluded when product_id or product_name is missing/blank, when
-- price is not strictly positive, or when cost is NULL or negative.
--
-- NOTE(review): no target column list is given, so the SELECT expressions are
-- matched to the table's columns by position — keep this order in sync with
-- the products_clean definition.
INSERT OVERWRITE TABLE products_clean
WITH brand_counts AS (
    -- Number of bronze products per brand. Keyed on the TRIMmed brand so the
    -- count lines up with the trimmed `brand` value written below; otherwise
    -- 'Acme' and 'Acme ' would be counted (and categorised) separately.
    SELECT
        TRIM(brand) AS brand,
        COUNT(*) AS product_count
    FROM apjtechup.bronze.products_raw
    WHERE brand IS NOT NULL
    GROUP BY TRIM(brand)
)
SELECT
    p.product_id,
    TRIM(p.product_name) AS product_name,
    -- Collapse every run of whitespace in the trimmed name into a single space
    REGEXP_REPLACE(TRIM(p.product_name), '\\s+', ' ') AS product_name_clean,
    TRIM(p.description) AS description,
    p.category_id,
    TRIM(c.category_name) AS category_name,
    -- Category path: 'Parent > Child' when the parent category resolves to a
    -- name, otherwise just the child name. Testing pc.category_name (rather
    -- than c.parent_category_id) avoids a NULL path when the parent id points
    -- at a missing or unnamed row, because CONCAT yields NULL if any argument
    -- is NULL.
    CASE
        WHEN pc.category_name IS NOT NULL
            THEN CONCAT(TRIM(pc.category_name), ' > ', TRIM(c.category_name))
        ELSE TRIM(c.category_name)
    END AS category_path,
    p.supplier_id,
    TRIM(s.supplier_name) AS supplier_name,
    p.sku,
    p.price,
    p.cost,
    -- Profit margin as a percentage of price, rounded to 2 decimals.
    -- NULLIF guards the division; the WHERE clause already requires price > 0.
    ROUND(((p.price - p.cost) / NULLIF(p.price, 0)) * 100, 2) AS profit_margin,
    -- Price tier: upper bounds are exclusive (<25, <100, <500, else Luxury)
    CASE
        WHEN p.price < 25 THEN 'Budget'
        WHEN p.price < 100 THEN 'Mid-Range'
        WHEN p.price < 500 THEN 'Premium'
        ELSE 'Luxury'
    END AS price_tier,
    p.weight,
    -- Weight category; a missing weight is reported explicitly as 'Unknown'
    CASE
        WHEN p.weight IS NULL THEN 'Unknown'
        WHEN p.weight < 1 THEN 'Light'
        WHEN p.weight < 5 THEN 'Medium'
        WHEN p.weight < 20 THEN 'Heavy'
        ELSE 'Very Heavy'
    END AS weight_category,
    p.dimensions,
    TRIM(UPPER(p.color)) AS color,
    TRIM(UPPER(p.size)) AS size,
    TRIM(p.brand) AS brand,
    -- Brand category from the number of products sharing the (trimmed) brand.
    -- NOTE(review): a product with a NULL brand has no brand_counts match and
    -- therefore falls through to 'Boutique Brand' — confirm that is intended.
    CASE
        WHEN bc.product_count >= 100 THEN 'Major Brand'
        WHEN bc.product_count >= 20 THEN 'Popular Brand'
        WHEN bc.product_count >= 5 THEN 'Niche Brand'
        ELSE 'Boutique Brand'
    END AS brand_category,
    CASE WHEN UPPER(p.status) = 'ACTIVE' THEN TRUE ELSE FALSE END AS is_active,
    -- Lifecycle stage: 'Discontinued' wins over age; otherwise bucket by the
    -- number of days between created_at and today.
    -- NOTE(review): a NULL created_at makes every age comparison unknown, so
    -- such rows fall through to 'Legacy' — confirm that is intended.
    CASE
        WHEN UPPER(p.status) = 'DISCONTINUED' THEN 'Discontinued'
        WHEN DATEDIFF(CURRENT_DATE(), p.created_at) <= 30 THEN 'New Launch'
        WHEN DATEDIFF(CURRENT_DATE(), p.created_at) <= 180 THEN 'Growth'
        WHEN DATEDIFF(CURRENT_DATE(), p.created_at) <= 730 THEN 'Mature'
        ELSE 'Legacy'
    END AS product_lifecycle_stage,
    p.created_at,
    p.updated_at,
    DATEDIFF(CURRENT_DATE(), p.created_at) AS days_since_launch,
    CURRENT_TIMESTAMP() AS processing_timestamp
FROM apjtechup.bronze.products_raw AS p
-- c = the product's own category, pc = that category's parent (self-join)
LEFT JOIN apjtechup.bronze.categories_raw AS c
    ON p.category_id = c.category_id
LEFT JOIN apjtechup.bronze.categories_raw AS pc
    ON c.parent_category_id = pc.category_id
LEFT JOIN apjtechup.bronze.suppliers_raw AS s
    ON p.supplier_id = s.supplier_id
LEFT JOIN brand_counts AS bc
    ON TRIM(p.brand) = bc.brand
WHERE p.product_id IS NOT NULL
  AND p.product_name IS NOT NULL
  AND LENGTH(TRIM(p.product_name)) > 0
  AND p.price > 0
  AND p.cost >= 0;


In [0]:
-- Session-scoped view producing a single summary row over products_clean:
-- overall and active counts, category/supplier coverage, profitability, and
-- the number of products in each price tier and lifecycle stage.
CREATE OR REPLACE TEMPORARY VIEW product_quality_metrics AS
SELECT
    COUNT(*)                                            AS total_products,
    COUNT_IF(is_active = TRUE)                          AS active_products,
    -- Coverage: rows whose category / supplier lookup produced a name
    COUNT_IF(category_name IS NOT NULL)                 AS products_with_categories,
    COUNT_IF(supplier_name IS NOT NULL)                 AS products_with_suppliers,
    -- Profitability
    COUNT_IF(profit_margin > 0)                         AS profitable_products,
    AVG(profit_margin)                                  AS avg_profit_margin,
    -- Price tier breakdown
    COUNT_IF(price_tier = 'Budget')                     AS budget_products,
    COUNT_IF(price_tier = 'Mid-Range')                  AS midrange_products,
    COUNT_IF(price_tier = 'Premium')                    AS premium_products,
    COUNT_IF(price_tier = 'Luxury')                     AS luxury_products,
    -- Lifecycle stage breakdown
    COUNT_IF(product_lifecycle_stage = 'New Launch')    AS new_launches,
    COUNT_IF(product_lifecycle_stage = 'Growth')        AS growth_stage,
    COUNT_IF(product_lifecycle_stage = 'Mature')        AS mature_products,
    COUNT_IF(product_lifecycle_stage = 'Legacy')        AS legacy_products,
    COUNT_IF(product_lifecycle_stage = 'Discontinued')  AS discontinued_products
FROM products_clean;


In [0]:
-- Session-scoped view: one row per category name found in products_clean,
-- with product count, average price, average margin and active-product count.
-- Products without a category name are left out; rows are ordered from the
-- largest category to the smallest.
CREATE OR REPLACE TEMPORARY VIEW category_distribution AS
SELECT
    category_name,
    COUNT(*)                   AS product_count,
    AVG(price)                 AS avg_price,
    AVG(profit_margin)         AS avg_margin,
    COUNT_IF(is_active = TRUE) AS active_count
FROM products_clean
WHERE category_name IS NOT NULL
GROUP BY category_name
ORDER BY product_count DESC;


In [0]:
-- Session-scoped view: the 20 largest (brand, brand_category) groups in
-- products_clean by product count, with average price and average margin.
-- Products without a brand are excluded.
CREATE OR REPLACE TEMPORARY VIEW brand_distribution AS
SELECT
    brand,
    brand_category,
    COUNT(*)           AS product_count,
    AVG(price)         AS avg_price,
    AVG(profit_margin) AS avg_margin
FROM products_clean
WHERE brand IS NOT NULL
GROUP BY brand, brand_category
-- brand / brand_category break ties on product_count; without them the rows
-- that survive LIMIT 20 could differ from run to run when counts are equal.
ORDER BY product_count DESC, brand, brand_category
LIMIT 20;


In [0]:
-- Display quality metrics
-- Each report is preceded by a one-row label query naming the result set.
-- NOTE(review): if all of these statements run in a single notebook cell,
-- check that every result set is actually rendered — some notebook UIs show
-- only the output of the last statement.
SELECT 'Product Quality Metrics' AS metric_type;
SELECT * FROM product_quality_metrics;

-- Order explicitly here instead of relying on the ORDER BY inside the view,
-- and break ties by name so the ten rows returned are stable between runs.
SELECT 'Top Categories' AS metric_type;
SELECT *
FROM category_distribution
ORDER BY product_count DESC, category_name
LIMIT 10;

-- brand_distribution is already limited to 20 rows; only the ordering is
-- restated so the output order does not depend on the view's internal sort.
SELECT 'Top Brands' AS metric_type;
SELECT *
FROM brand_distribution
ORDER BY product_count DESC, brand, brand_category;


In [0]:
-- Optimize table
-- Post-load maintenance on products_clean, run after the table has been rewritten.
-- NOTE(review): OPTIMIZE presumably targets a Delta table here (file
-- compaction) — confirm the table format.
OPTIMIZE products_clean;

-- Update table statistics
-- No FOR COLUMNS / FOR ALL COLUMNS clause is given, so only table-level
-- statistics are requested, not per-column statistics.
ANALYZE TABLE products_clean COMPUTE STATISTICS;
