
#Use necessary catalog and schema

In [0]:
%sql
-- Select the online_retail catalog and its silver schema for this session,
-- creating the schema first so the cell can be re-run safely.
USE CATALOG online_retail;
CREATE SCHEMA IF NOT EXISTS silver;
USE SCHEMA silver;

#Create source view from raw_data

In [0]:
# Read every column of bronze.raw_data and register the result as the temp view
# `uvw_raw_df`, which the SQL MERGE cell below selects from.
raw_df = spark.sql("SELECT * FROM bronze.raw_data")
raw_df.createOrReplaceTempView("uvw_raw_df")


#Merge Data into Target Table

In [0]:
%sql
-- Target table for the cleaned retail transactions.
-- The three identifying text columns are mandatory; the rest are nullable.
CREATE TABLE IF NOT EXISTS cleaned_data (
    invoice_number      STRING NOT NULL,
    product_code        STRING NOT NULL,
    product_description STRING NOT NULL,
    quantity            INTEGER,
    invoice_date        TIMESTAMP,
    unit_price          DOUBLE,
    customer_id         INT,
    country             STRING
);

-- Each constraint is dropped (if present) and re-added so the cell stays re-runnable.

-- Quantities must not be negative.
ALTER TABLE cleaned_data
    DROP CONSTRAINT IF EXISTS quantity_check;
ALTER TABLE cleaned_data
    ADD CONSTRAINT quantity_check CHECK (quantity >= 0);

-- Unit prices must not be negative.
ALTER TABLE cleaned_data
    DROP CONSTRAINT IF EXISTS unit_price_check;
ALTER TABLE cleaned_data
    ADD CONSTRAINT unit_price_check CHECK (unit_price >= 0);

In [0]:
%sql

-- VACUUM cleaned_data RETAIN 168 HOURS;  -- Commented out: does not need to run on every execution

In [0]:
%sql

-- Load cleaned rows from the raw view into cleaned_data.
-- The MERGE matches on every column and only inserts, so re-running the cell
-- adds rows that are not already present and never updates or deletes.
WITH CTE_cleaned_data AS (
    -- Rename the raw columns, substitute defaults for NULLs and drop exact duplicates.
    SELECT DISTINCT
        COALESCE(InvoiceNo, 'Unknown')                              AS invoice_number         -- NULL -> 'Unknown'
        , COALESCE(StockCode, 'Unknown')                            AS product_code           -- NULL -> 'Unknown'
        , COALESCE(Description, 'Unknown')                          AS product_description    -- NULL -> 'Unknown'
        , COALESCE(Quantity, 0)                                     AS quantity               -- NULL -> 0
        , COALESCE(InvoiceDate, '9999-01-01T00:00:00.000+00:00')    AS invoice_date           -- NULL -> sentinel far-future date
        , COALESCE(UnitPrice, 0)                                    AS unit_price             -- NULL -> 0
        , COALESCE(CustomerID, 0)                                   AS customer_id            -- NULL -> 0 (sentinel for "missing customer")
        , COALESCE(Country, 'Unknown')                              AS country                -- NULL -> 'Unknown'
    FROM uvw_raw_df
    -- Drop records with a negative quantity or unit price.
    -- The filter is applied to the defaulted values: a bare `Quantity >= 0`
    -- evaluates to NULL for a NULL input and would discard the row before the
    -- NULL -> 0 replacement above could take effect.
    WHERE COALESCE(Quantity, 0) >= 0
      AND COALESCE(UnitPrice, 0) >= 0
)

-- Merging on all columns to load only records that are not already in the target.
-- Every source column is non-NULL after the COALESCE defaults, so plain `=` is sufficient here.
MERGE INTO cleaned_data AS TGT
USING CTE_cleaned_data AS SRC
ON TGT.invoice_number          = SRC.invoice_number
  AND TGT.product_code         = SRC.product_code
  AND TGT.product_description  = SRC.product_description
  AND TGT.quantity             = SRC.quantity
  AND TGT.invoice_date         = SRC.invoice_date
  AND TGT.unit_price           = SRC.unit_price
  AND TGT.customer_id          = SRC.customer_id
  AND TGT.country              = SRC.country
WHEN NOT MATCHED
THEN INSERT *;


#Data Quality Monitoring and Alerts

In [0]:
%sql
-- One row per executed data-quality check: what was checked, on which layer,
-- how many offending rows were found and when the check ran.
CREATE TABLE IF NOT EXISTS data_quality_checks (
    check_name   STRING,
    layer        STRING,
    issue_count  BIGINT,
    last_checked TIMESTAMP
);

-- One row per raised alert, written when a check crosses its threshold.
CREATE TABLE IF NOT EXISTS data_quality_alerts (
    alert_id    STRING,
    issue       STRING,
    layer       STRING,
    detected_at TIMESTAMP,
    severity    STRING
);

In [0]:
%sql
-- Record how many cleaned rows carry customer_id = 0,
-- the value the load substitutes for a NULL CustomerID.
INSERT INTO data_quality_checks (
    check_name,
    layer,
    issue_count,
    last_checked
)
SELECT
    'Missing CustomerID' AS check_name,
    'Silver'             AS layer,
    COUNT(*)             AS issue_count,
    CURRENT_TIMESTAMP    AS last_checked
FROM cleaned_data
WHERE customer_id = 0;

In [0]:
%sql

-- Record how many cleaned rows have a unit price of zero or below.
-- While the unit_price_check constraint (unit_price >= 0) is in force on
-- cleaned_data, only zero-priced rows can match.
INSERT INTO data_quality_checks (
    check_name,
    layer,
    issue_count,
    last_checked
)
SELECT
    'Negative or Zero Price' AS check_name,
    'Silver'                 AS layer,
    COUNT(*)                 AS issue_count,
    CURRENT_TIMESTAMP        AS last_checked
FROM cleaned_data
WHERE unit_price <= 0;

In [0]:
%sql

-- Record the number of (invoice_number, customer_id, product_code) combinations
-- that occur more than once in cleaned_data. The count is of duplicated key
-- groups, not of the individual rows inside them.
INSERT INTO data_quality_checks (
    check_name,
    layer,
    issue_count,
    last_checked
)
SELECT
    'Duplicate Invoices' AS check_name,
    'Silver'             AS layer,
    COUNT(*)             AS issue_count,
    CURRENT_TIMESTAMP    AS last_checked
FROM (
    SELECT
        invoice_number,
        customer_id,
        product_code,
        COUNT(*) AS row_count
    FROM cleaned_data
    GROUP BY
        invoice_number,
        customer_id,
        product_code
    HAVING COUNT(*) > 1
) AS dup_keys;

In [0]:
%sql
-- Data-Quality-Alerts
-- Insert a 'High' severity alert when more than 1000 cleaned rows have no customer ID.
-- The load into cleaned_data replaces a NULL CustomerID with the sentinel 0, so the
-- sentinel is what must be counted here (the 'Missing CustomerID' check counts
-- customer_id = 0 as well). NULL is still counted because the column is nullable.
INSERT INTO data_quality_alerts (
  alert_id
  , issue
  , layer
  , severity
  , detected_at
)
SELECT 
  UUID()
  , 'High number of missing CustomerIDs'
  , 'Silver'
  , 'High'
  , current_timestamp
-- No FROM clause: the SELECT yields one row when the threshold is exceeded, otherwise none.
WHERE (
    SELECT COUNT(*)
    FROM cleaned_data
    WHERE customer_id = 0
       OR customer_id IS NULL
) > 1000;

In [0]:
%sql
-- Raise a 'Medium' severity alert when more than 100 cleaned rows
-- have a unit price of zero or below.
-- The SELECT has no FROM clause: it yields one row when the threshold
-- is exceeded and no rows otherwise.
INSERT INTO data_quality_alerts (
    alert_id,
    issue,
    layer,
    severity,
    detected_at
)
SELECT
    UUID()                             AS alert_id,
    'Negative or zero prices detected' AS issue,
    'Silver'                           AS layer,
    'Medium'                           AS severity,
    CURRENT_TIMESTAMP                  AS detected_at
WHERE (
    SELECT COUNT(*)
    FROM cleaned_data
    WHERE unit_price <= 0
) > 100;

In [0]:
%sql
-- Raise a 'Low' severity alert when more than 50 distinct
-- (invoice_number, customer_id, product_code) combinations occur
-- more than once in cleaned_data.
-- The SELECT has no FROM clause: it yields one row when the threshold
-- is exceeded and no rows otherwise.
INSERT INTO data_quality_alerts (
    alert_id,
    issue,
    layer,
    severity,
    detected_at
)
SELECT
    UUID()                        AS alert_id,
    'Duplicate invoices detected' AS issue,
    'Silver'                      AS layer,
    'Low'                         AS severity,
    CURRENT_TIMESTAMP             AS detected_at
WHERE (
    SELECT COUNT(*)
    FROM (
        SELECT
            invoice_number,
            customer_id,
            product_code,
            COUNT(*) AS row_count
        FROM cleaned_data
        GROUP BY
            invoice_number,
            customer_id,
            product_code
        HAVING COUNT(*) > 1
    ) AS dup_keys
) > 50;
