In [0]:
%sql
-- Switch the session to the online_retail catalog, create the silver schema
-- on first run, and select it for the statements that follow.
USE CATALOG online_retail;
CREATE SCHEMA IF NOT EXISTS silver;
USE SCHEMA silver;

In [0]:
# Read every row of bronze.raw_data into a DataFrame and register it as the
# temp view uvw_raw_df so it can be queried by name from SQL.
raw_df = spark.sql("SELECT * FROM bronze.raw_data")
raw_df.createOrReplaceTempView("uvw_raw_df")

In [0]:
%sql
-- Table that receives the cleaned records; it is only created when absent.
CREATE TABLE IF NOT EXISTS cleaned_data (
    invoice_number      STRING NOT NULL,
    product_code        STRING NOT NULL,
    product_description STRING NOT NULL,
    quantity            INTEGER,
    invoice_date        TIMESTAMP,
    unit_price          DOUBLE,
    customer_id         INT,
    country             STRING
);

-- Each CHECK constraint is dropped (if present) and added again so that this
-- cell can be re-run without failing on an already-existing constraint.

-- quantity must not be negative.
ALTER TABLE cleaned_data
    DROP CONSTRAINT IF EXISTS quantity_check;
ALTER TABLE cleaned_data
    ADD CONSTRAINT quantity_check CHECK (quantity >= 0);

-- unit_price must not be negative.
ALTER TABLE cleaned_data
    DROP CONSTRAINT IF EXISTS unit_price_check;
ALTER TABLE cleaned_data
    ADD CONSTRAINT unit_price_check CHECK (unit_price >= 0);

In [0]:
%sql
-- Append cleaned rows from uvw_raw_df that are not yet present in cleaned_data.
-- The source subquery keeps only rows whose Quantity and UnitPrice are >= 0
-- (rows where either is NULL fail the filter too), substitutes placeholder
-- values for NULLs in the remaining columns, and removes exact duplicates.
-- A source row is treated as already loaded only when all eight columns match
-- an existing target row; matched rows are left untouched.
MERGE INTO cleaned_data AS tgt
USING (
    SELECT DISTINCT
        COALESCE(InvoiceNo, 'Unknown')                           AS invoice_number,
        COALESCE(StockCode, 'Unknown')                           AS product_code,
        COALESCE(Description, 'Unknown')                         AS product_description,
        COALESCE(Quantity, 0)                                    AS quantity,
        COALESCE(InvoiceDate, '9999-01-01T00:00:00.000+00:00')   AS invoice_date,
        COALESCE(UnitPrice, 0)                                   AS unit_price,
        COALESCE(CustomerID, 0)                                  AS customer_id,
        COALESCE(Country, 'Unknown')                             AS country
    FROM uvw_raw_df
    WHERE Quantity >= 0
      AND UnitPrice >= 0
) AS src
    ON  tgt.invoice_number      = src.invoice_number
    AND tgt.product_code        = src.product_code
    AND tgt.product_description = src.product_description
    AND tgt.quantity            = src.quantity
    AND tgt.invoice_date        = src.invoice_date
    AND tgt.unit_price          = src.unit_price
    AND tgt.customer_id         = src.customer_id
    AND tgt.country             = src.country
WHEN NOT MATCHED THEN
    INSERT *

In [0]:
%sql
-- One row per executed data-quality check: which check ran, on which layer,
-- how many offending rows it counted, and when it ran.
CREATE TABLE IF NOT EXISTS data_quality_checks (
    check_name   STRING
    , layer        STRING
    , issue_count  BIGINT
    , last_checked TIMESTAMP
);

-- One row per raised alert, with an identifier, a description of the issue,
-- the layer it concerns, the detection time, and a severity label.
CREATE TABLE IF NOT EXISTS data_quality_alerts (
    alert_id    STRING
    , issue       STRING
    , layer       STRING
    , detected_at TIMESTAMP
    , severity    STRING
);

In [0]:
%sql
-- Record how many cleaned_data rows carry customer_id = 0.
-- A single result row is written even when the count is zero.
INSERT INTO data_quality_checks (check_name, layer, issue_count, last_checked)
SELECT
    'Missing CustomerID' AS check_name,
    'Silver'             AS layer,
    COUNT(*)             AS issue_count,
    CURRENT_TIMESTAMP    AS last_checked
FROM cleaned_data
WHERE customer_id = 0;

In [0]:
%sql

-- Record how many cleaned_data rows have a unit_price of zero or below.
-- A single result row is written even when the count is zero.
INSERT INTO data_quality_checks (check_name, layer, issue_count, last_checked)
SELECT
    'Negative or Zero Price' AS check_name,
    'Silver'                 AS layer,
    COUNT(*)                 AS issue_count,
    CURRENT_TIMESTAMP        AS last_checked
FROM cleaned_data
WHERE unit_price <= 0;

In [0]:
%sql

-- Record the number of (invoice_number, customer_id, product_code)
-- combinations that occur more than once in cleaned_data.
-- The count is of duplicated combinations, not of the rows inside them.
INSERT INTO data_quality_checks (check_name, layer, issue_count, last_checked)
SELECT
    'Duplicate Invoices' AS check_name,
    'Silver'             AS layer,
    COUNT(*)             AS issue_count,
    CURRENT_TIMESTAMP    AS last_checked
FROM (
    SELECT
        invoice_number,
        customer_id,
        product_code,
        COUNT(*) AS row_count
    FROM cleaned_data
    GROUP BY
        invoice_number,
        customer_id,
        product_code
    HAVING COUNT(*) > 1
) AS repeated_keys;

In [0]:
%sql
-- Data-Quality-Alerts
-- Insert alerts for major anomalies.
--
-- Raise a High-severity alert when more than 1000 rows in cleaned_data have
-- no real customer ID.
-- The load into cleaned_data replaces a missing CustomerID with 0
-- (COALESCE(CustomerID, 0)), so the placeholder value 0 is what marks a
-- missing ID. Counting only NULLs would never reach the threshold. NULL is
-- still counted in case rows reach the table by another path.
INSERT INTO data_quality_alerts (
  alert_id
  , issue
  , layer
  , severity
  , detected_at
)
SELECT
  UUID()
  , 'High number of missing CustomerIDs'
  , 'Silver'
  , 'High'
  , current_timestamp
WHERE (
    SELECT COUNT(*)
    FROM cleaned_data
    WHERE customer_id = 0
       OR customer_id IS NULL
) > 1000;

In [0]:
%sql

-- Raise a Medium-severity alert when more than 100 rows in cleaned_data have
-- a unit_price of zero or below. Nothing is inserted otherwise.
INSERT INTO data_quality_alerts (alert_id, issue, layer, severity, detected_at)
SELECT
    UUID()                             AS alert_id,
    'Negative or zero prices detected' AS issue,
    'Silver'                           AS layer,
    'Medium'                           AS severity,
    CURRENT_TIMESTAMP                  AS detected_at
WHERE (
    SELECT COUNT(*)
    FROM cleaned_data
    WHERE unit_price <= 0
) > 100;

In [0]:
%sql

-- Raise a Low-severity alert when more than 50 distinct
-- (invoice_number, customer_id, product_code) combinations appear more than
-- once in cleaned_data. Nothing is inserted otherwise.
INSERT INTO data_quality_alerts (alert_id, issue, layer, severity, detected_at)
SELECT
    UUID()                        AS alert_id,
    'Duplicate invoices detected' AS issue,
    'Silver'                      AS layer,
    'Low'                         AS severity,
    CURRENT_TIMESTAMP             AS detected_at
WHERE (
    SELECT COUNT(*)
    FROM (
        SELECT
            invoice_number,
            customer_id,
            product_code,
            COUNT(*) AS row_count
        FROM cleaned_data
        GROUP BY
            invoice_number,
            customer_id,
            product_code
        HAVING COUNT(*) > 1
    ) AS repeated_keys
) > 50;
