
# Use the required catalog and schema

In [0]:
%sql
-- Select the online_retail catalog, make sure the gold schema exists in it,
-- and make gold the current schema for the unqualified table names used below.
USE CATALOG online_retail;

CREATE SCHEMA IF NOT EXISTS gold;

USE SCHEMA gold;


# Read from the source tables

In [0]:
# Query text for the three source DataFrames, all read from aggregated_data.
# The product and customer queries de-duplicate their attribute columns with
# SELECT DISTINCT; the sale query returns every column of every row.
product_query = """SELECT DISTINCT 
                        product_code
                        , product_description
                        , average_unit_price 
                        FROM aggregated_data"""

customer_query = """SELECT DISTINCT 
                        customer_id
                        , country 
                        FROM aggregated_data"""

sale_query = "SELECT * FROM aggregated_data"

# Run the queries through the notebook's Spark session.
product_df = spark.sql(product_query)
customer_df = spark.sql(customer_query)
sale_df = spark.sql(sale_query)

In [0]:
# Expose each source DataFrame to the SQL cells as a temporary view,
# replacing any view that already exists under the same name.
for view_name, source_df in (
    ("uvw_src_product_df", product_df),
    ("uvw_src_customer_df", customer_df),
    ("uvw_src_sale_df", sale_df),
):
    source_df.createOrReplaceTempView(view_name)


# Merge data into the target tables

In [0]:
%sql
-- NOTE(review): the TBLPROPERTIES key used on all three tables below looks like
-- a Spark session configuration name rather than a Delta table property;
-- confirm that setting it per table has the intended effect on VACUUM logging.

-- Product dimension, keyed by a surrogate identity column.
-- The identity is GENERATED BY DEFAULT, so rows may also supply their own key.
CREATE TABLE IF NOT EXISTS dim_product (
    dim_product_id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1) PRIMARY KEY,
    product_code STRING,
    product_description STRING,
    average_unit_price DOUBLE
)
TBLPROPERTIES ('spark.databricks.delta.vacuum.logging.enable' = 'true');

-- Customer dimension, keyed by a surrogate identity column.
-- The identity is GENERATED BY DEFAULT, so rows may also supply their own key.
CREATE TABLE IF NOT EXISTS dim_customer (
    dim_customer_id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1) PRIMARY KEY,
    customer_id INTEGER,
    country STRING
)
TBLPROPERTIES ('spark.databricks.delta.vacuum.logging.enable' = 'true');

-- Sales fact table. Its own key is GENERATED ALWAYS; each row points at one
-- product and one customer dimension row through the two foreign keys.
CREATE TABLE IF NOT EXISTS fact_sale (
    fct_sale_id BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 1 INCREMENT BY 1) PRIMARY KEY,
    invoice_number STRING,
    invoice_date TIMESTAMP,
    dim_product_id BIGINT,
    dim_customer_id BIGINT,
    total_quantity INTEGER,
    CONSTRAINT fk_product FOREIGN KEY (dim_product_id) REFERENCES dim_product,
    CONSTRAINT fk_customer FOREIGN KEY (dim_customer_id) REFERENCES dim_customer
)
TBLPROPERTIES ('spark.databricks.delta.vacuum.logging.enable' = 'true');

In [0]:
%sql

--VACUUM dim_product RETAIN 720 HOURS;            -- Commented out: does not need to run on every execution
--VACUUM dim_customer RETAIN 720 HOURS;           -- Commented out: does not need to run on every execution
--VACUUM fact_sale RETAIN 720 HOURS;              -- Commented out: does not need to run on every execution

In [0]:
%sql

-- Seed the placeholder rows of both dimensions:
--   key -2 : all attributes NULL
--   key -1 : attributes set to 'Unknown' / -1
-- A plain INSERT ... VALUES would add another copy of these rows every time
-- the notebook runs. The LEFT ANTI JOIN keeps only the placeholder keys that
-- are not yet present in the target, so re-running this cell is a no-op.

-- Insert default rows into products dimension (only if missing)
INSERT INTO dim_product (
    dim_product_id,
    product_code,
    product_description,
    average_unit_price
)
SELECT
    SRC.dim_product_id,
    SRC.product_code,
    SRC.product_description,
    SRC.average_unit_price
FROM VALUES
    (CAST(-2 AS BIGINT), CAST(NULL AS STRING), CAST(NULL AS STRING), CAST(NULL AS DOUBLE)),
    (CAST(-1 AS BIGINT), 'Unknown', 'Unknown', CAST(-1 AS DOUBLE))
    AS SRC (dim_product_id, product_code, product_description, average_unit_price)
LEFT ANTI JOIN dim_product AS TGT
    ON TGT.dim_product_id = SRC.dim_product_id;

-- Insert default rows into customers dimension (only if missing)
INSERT INTO dim_customer (
    dim_customer_id,
    customer_id,
    country
)
SELECT
    SRC.dim_customer_id,
    SRC.customer_id,
    SRC.country
FROM VALUES
    (CAST(-2 AS BIGINT), CAST(NULL AS INTEGER), CAST(NULL AS STRING)),
    (CAST(-1 AS BIGINT), -1, 'Unknown')
    AS SRC (dim_customer_id, customer_id, country)
LEFT ANTI JOIN dim_customer AS TGT
    ON TGT.dim_customer_id = SRC.dim_customer_id;

In [0]:
%sql

-- Add every product combination from the source view that is not yet in
-- dim_product. Existing rows are never updated; dim_product_id is left to the
-- identity column.
--
-- The match uses the null-safe operator <=> instead of =. With =, a source row
-- whose code, description or price is NULL can never match an existing row
-- (NULL = NULL is not true), so it would be inserted again on every run.
-- For non-NULL values <=> behaves exactly like =.
MERGE INTO dim_product AS TGT
USING uvw_src_product_df AS SRC
    ON TGT.product_code <=> SRC.product_code
    AND TGT.product_description <=> SRC.product_description
    AND TGT.average_unit_price <=> SRC.average_unit_price

WHEN NOT MATCHED THEN
    INSERT (
        product_code,
        product_description,
        average_unit_price
    )
    VALUES (
        SRC.product_code,
        SRC.product_description,
        SRC.average_unit_price
    )

In [0]:
%sql

-- Add every (customer_id, country) pair from the source view that is not yet
-- in dim_customer. Existing rows are never updated; dim_customer_id is left to
-- the identity column.
--
-- The match uses the null-safe operator <=> instead of =. With =, a source row
-- with a NULL customer_id or country can never match an existing row
-- (NULL = NULL is not true), so it would be inserted again on every run.
-- For non-NULL values <=> behaves exactly like =.
MERGE INTO dim_customer AS TGT
USING uvw_src_customer_df AS SRC
    ON TGT.customer_id <=> SRC.customer_id
    AND TGT.country <=> SRC.country

WHEN NOT MATCHED THEN
    INSERT (
        customer_id,
        country
    )
    VALUES (
        SRC.customer_id,
        SRC.country
    )

In [0]:
%sql

-- Resolve each source sale row to its dimension surrogate keys, then insert
-- the rows that are not yet present in fact_sale. Existing rows are never
-- updated; fct_sale_id is generated by the identity column.
WITH CTE_fact_sale AS (
  SELECT
    SRCV.invoice_number
    , SRCV.invoice_date
    , P.dim_product_id
    , C.dim_customer_id
    , SRCV.total_quantity
  FROM uvw_src_sale_df SRCV
  -- dim_product holds one row per (code, description, price) combination (see
  -- its MERGE), so the price must take part in the lookup as well; joining on
  -- code and description alone can return several dimension rows for one sale
  -- and multiply the fact rows. <=> keeps NULL prices matching.
  LEFT JOIN dim_product AS P
    ON SRCV.product_code = P.product_code
    AND LOWER(SRCV.product_description) = LOWER(P.product_description)
    AND SRCV.average_unit_price <=> P.average_unit_price
  LEFT JOIN dim_customer AS C
    ON SRCV.customer_id = C.customer_id
    AND LOWER(SRCV.country) = LOWER(C.country)
)

MERGE INTO fact_sale AS TGT
USING CTE_fact_sale AS SRC
-- Null-safe comparison: the LEFT JOINs above leave dim_product_id /
-- dim_customer_id NULL when no dimension row is found. With = such rows would
-- never match their earlier copy and would be re-inserted on every run.
ON TGT.invoice_number <=> SRC.invoice_number
AND TGT.invoice_date <=> SRC.invoice_date
AND TGT.dim_product_id <=> SRC.dim_product_id
AND TGT.dim_customer_id <=> SRC.dim_customer_id
AND TGT.total_quantity <=> SRC.total_quantity

WHEN NOT MATCHED
THEN INSERT (
  invoice_number
  , invoice_date
  , dim_product_id
  , dim_customer_id
  , total_quantity
)
VALUES(
  SRC.invoice_number
  , SRC.invoice_date
  , SRC.dim_product_id
  , SRC.dim_customer_id
  , SRC.total_quantity
)

In [0]:
%sql

-- Compact each target table and Z-order it by the column named in its
-- ZORDER BY clause.
OPTIMIZE dim_product ZORDER BY (product_code);
OPTIMIZE dim_customer ZORDER BY (customer_id);
OPTIMIZE fact_sale ZORDER BY (invoice_number);