
#Use necessary catalog and schema

In [0]:
%sql
-- Make online_retail the current catalog for the statements that follow.
USE CATALOG online_retail;
-- Create the gold schema on the first run; later runs skip creation.
CREATE SCHEMA IF NOT EXISTS gold;
-- Make gold the current schema, so unqualified table names resolve against it.
USE SCHEMA gold;

#Read From Silver Layer

In [0]:
# Load every column of the silver-layer table silver.cleaned_data and register
# the result as the temporary view uvw_src_cleaned_df for use in SQL.
cleaned_df = spark.table("silver.cleaned_data")
cleaned_df.createOrReplaceTempView("uvw_src_cleaned_df")


#Merge Data into Target Table

In [0]:
%sql
-- Create the merge target table; IF NOT EXISTS skips creation when it already exists.
CREATE TABLE IF NOT EXISTS aggregated_data (
  invoice_number STRING NOT NULL
  , product_code STRING NOT NULL
  , product_description STRING NOT NULL
  , total_quantity INTEGER
  , invoice_date TIMESTAMP
  -- NOTE(review): DOUBLE is an approximate type; consider DECIMAL if exact prices matter -- confirm with consumers.
  , average_unit_price DOUBLE
  , customer_id INTEGER
  , country STRING
)
-- NOTE(review): this key looks like the Spark session setting for VACUUM audit logging
-- (documented as spark.databricks.delta.vacuum.logging.enabled) rather than a Delta table
-- property; as written it is presumably stored on the table but has no effect -- TODO confirm.
TBLPROPERTIES ( 
  'spark.databricks.delta.vacuum.logging.enable' = 'true'
);

-- Drop (if present) and re-add the composite primary key, so every run ends with
-- invoices_pk defined on (invoice_number, product_code).
-- NOTE(review): Databricks documents primary keys as informational (not enforced) -- confirm for this workspace.
ALTER TABLE aggregated_data DROP CONSTRAINT IF EXISTS invoices_pk;
ALTER TABLE aggregated_data ADD CONSTRAINT invoices_pk PRIMARY KEY (invoice_number, product_code);

In [0]:
%sql

--VACUUM aggregated_data RETAIN 720 HOURS;  -- Commented out because it does not need to run every time

In [0]:
%sql

-- Collapse the source view to one row per (invoice_number, product_code), so each
-- target key is matched by at most one source row in the MERGE below.
WITH CTE_transformed_data AS (
  SELECT
    invoice_number
    , product_code
    , MAX(LOWER(product_description))   AS product_description                  -- Multiple descriptions were found for the same product_code; lower-case them and keep the maximum
    , SUM(quantity)                     AS total_quantity                       -- Total quantity of the same product within an invoice
    , MAX(invoice_date)                 AS invoice_date                         -- The same invoice can carry invoice_dates that differ by 1 second; keep the latest
    , AVG(unit_price)                   AS average_unit_price                   -- One invoice can have different prices for the same product; keep the average
    , MAX(customer_id)                  AS customer_id
    , MAX(UPPER(country))               AS country                              -- Upper-case country as part of standardization
  FROM uvw_src_cleaned_df
  GROUP BY
    invoice_number
    , product_code
)

-- Upsert keyed only on invoice_number and product_code, the primary-key columns.
MERGE INTO aggregated_data AS TGT
USING CTE_transformed_data AS SRC
  ON TGT.invoice_number = SRC.invoice_number
  AND TGT.product_code = SRC.product_code
-- Update a matched row only when at least one non-key value differs, so re-running
-- against unchanged source data does not rewrite the row. <=> is null-safe equality:
-- NULL <=> NULL is true, so NULL-to-value changes are still detected.
WHEN MATCHED AND NOT (
  TGT.product_description <=> SRC.product_description
  AND TGT.total_quantity <=> SRC.total_quantity
  AND TGT.invoice_date <=> SRC.invoice_date
  AND TGT.average_unit_price <=> SRC.average_unit_price
  AND TGT.customer_id <=> SRC.customer_id
  AND TGT.country <=> SRC.country
)
THEN UPDATE SET
  -- Key columns are equal by the ON condition, so only non-key columns are assigned.
  TGT.product_description = SRC.product_description
  , TGT.total_quantity = SRC.total_quantity
  , TGT.invoice_date = SRC.invoice_date
  , TGT.average_unit_price = SRC.average_unit_price
  , TGT.customer_id = SRC.customer_id
  , TGT.country = SRC.country
-- Explicit column lists keep the insert independent of the target's column order
-- and of any columns added to the target later.
WHEN NOT MATCHED
THEN INSERT (
  invoice_number
  , product_code
  , product_description
  , total_quantity
  , invoice_date
  , average_unit_price
  , customer_id
  , country
)
VALUES (
  SRC.invoice_number
  , SRC.product_code
  , SRC.product_description
  , SRC.total_quantity
  , SRC.invoice_date
  , SRC.average_unit_price
  , SRC.customer_id
  , SRC.country
)