In [0]:
%sql
-- Point the session at the online_retail catalog, make sure its gold schema
-- exists, then make that schema the default for unqualified names below.
USE CATALOG online_retail;

CREATE SCHEMA IF NOT EXISTS online_retail.gold;

USE SCHEMA gold;

In [0]:
# Load every column of silver.cleaned_data into a DataFrame.
# `spark` is not defined in this cell — presumably the notebook-provided SparkSession.
cleaned_df = spark.sql("SELECT * FROM silver.cleaned_data")
# Register the DataFrame as a temp view so the %sql cells can query it by name
# (the aggregation cell reads from uvw_src_cleaned_df).
cleaned_df.createOrReplaceTempView("uvw_src_cleaned_df")

In [0]:
%sql
-- Target table for the aggregated invoice lines.
-- CREATE ... IF NOT EXISTS leaves an already-existing table untouched.
CREATE TABLE IF NOT EXISTS aggregated_data (
    invoice_number        STRING     NOT NULL   -- key column (see invoices_pk)
  , product_code          STRING     NOT NULL   -- key column (see invoices_pk)
  , product_description   STRING     NOT NULL
  , total_quantity        INT
  , invoice_date          TIMESTAMP
  , average_unit_price    DOUBLE
  , customer_id           INT
  , country               STRING
);

-- Drop-then-add keeps this cell re-runnable: presumably the ADD below would
-- fail if invoices_pk were already present on the table.
ALTER TABLE aggregated_data
  DROP CONSTRAINT IF EXISTS invoices_pk;

-- NOTE(review): on Databricks, PRIMARY KEY constraints are documented as
-- informational (not enforced) — writers must guarantee uniqueness themselves.
ALTER TABLE aggregated_data
  ADD CONSTRAINT invoices_pk
  PRIMARY KEY (invoice_number, product_code);

In [0]:
%sql
-- Upsert aggregated invoice lines from the uvw_src_cleaned_df temp view into
-- aggregated_data, keyed on (invoice_number, product_code).
--
-- The source is grouped by exactly the merge key so that it yields at most one
-- row per key. Grouping additionally by customer_id / country (as before) could
-- emit several rows for the same key whenever those attributes differ between
-- source rows of one invoice line (including country values differing only in
-- letter case), which makes the MERGE fail on matched keys or insert duplicate
-- keys on unmatched ones. customer_id and country are therefore collapsed with
-- MAX, which returns the single value when the group is consistent.
WITH CTE_transformed_data AS (
  SELECT
    invoice_number
    , product_code
    , MAX(LOWER(product_description))   AS product_description
    , SUM(quantity)                     AS total_quantity
    , MAX(invoice_date)                 AS invoice_date
    , AVG(unit_price)                   AS average_unit_price
    , MAX(customer_id)                  AS customer_id
    , MAX(UPPER(country))               AS country
  FROM uvw_src_cleaned_df
  GROUP BY invoice_number
          , product_code
)

MERGE INTO aggregated_data AS TGT
USING CTE_transformed_data AS SRC
  ON TGT.invoice_number = SRC.invoice_number
  AND TGT.product_code = SRC.product_code
-- Existing key: refresh every non-key column (key columns are equal by the ON clause).
WHEN MATCHED THEN
  UPDATE SET
    TGT.product_description = SRC.product_description
    , TGT.total_quantity = SRC.total_quantity
    , TGT.invoice_date = SRC.invoice_date
    , TGT.average_unit_price = SRC.average_unit_price
    , TGT.customer_id = SRC.customer_id
    , TGT.country = SRC.country
-- New key: insert the full row with an explicit column mapping.
WHEN NOT MATCHED THEN
  INSERT (
    invoice_number
    , product_code
    , product_description
    , total_quantity
    , invoice_date
    , average_unit_price
    , customer_id
    , country
  )
  VALUES (
    SRC.invoice_number
    , SRC.product_code
    , SRC.product_description
    , SRC.total_quantity
    , SRC.invoice_date
    , SRC.average_unit_price
    , SRC.customer_id
    , SRC.country
  )