
# Read the raw CSV file into a DataFrame

In [0]:
# Load the raw CSV from the bronze volume. The first line of the file is used
# as the header, and Spark infers each column's type by scanning the data.
raw_df = (
    spark.read
    .options(header="true", inferschema="true")
    .csv("/Volumes/online_retail/bronze/raw/online_retail.csv")
)


# Use the necessary catalog and schema

In [0]:
%sql
-- Make online_retail the current catalog for the following statements.
USE CATALOG online_retail;

-- Create the bronze schema on first run (no-op when it already exists),
-- then make it the current schema so unqualified table names resolve to it.
CREATE SCHEMA IF NOT EXISTS online_retail.bronze;
USE SCHEMA bronze;


# Merge Data into Target Table

In [0]:
%sql
-- Target table for the raw load. Created once; later runs leave an existing
-- table untouched because of IF NOT EXISTS.
--
-- Liquid clustering is enabled with the CLUSTER BY clause. The previous
-- TBLPROPERTIES keys ('spark.databricks.delta.vacuum.logging.enable' and
-- 'spark.databricks.delta.liquidCluster.enabled') are not Delta table
-- properties: they were stored as plain metadata and enabled nothing.
-- NOTE(review): the clustering columns below are a suggested choice (leading
-- MERGE match columns) - confirm against the real query patterns.
-- NOTE(review): if the table already exists, apply clustering with
--   ALTER TABLE raw_data CLUSTER BY (InvoiceNo, StockCode);
-- NOTE(review): VACUUM audit logging is a session setting rather than a table
-- property; presumably
--   SET spark.databricks.delta.vacuum.logging.enabled = true;
-- is what was intended - confirm before relying on it.
CREATE TABLE IF NOT EXISTS raw_data (
  InvoiceNo STRING
  , StockCode STRING
  , Description STRING
  , Quantity INTEGER
  , InvoiceDate TIMESTAMP
  , UnitPrice DOUBLE
  , CustomerID DOUBLE
  , Country STRING
)
CLUSTER BY (InvoiceNo, StockCode);

In [0]:
%sql

--VACUUM raw_data RETAIN 168 HOURS;       -- Commented out: it does not need to run every time

In [0]:
# Expose the DataFrame to SQL cells as the session-scoped temp view
# "uvw_src_raw" (replacing any existing view of that name).
raw_df.createOrReplaceTempView("uvw_src_raw")

In [0]:
%sql

-- Insert every source row that is not already present in raw_data.
-- Rows are matched on all columns because we want to load all the records
-- and no single column (or subset) is treated as a key here.
--
-- The comparison uses null-safe equality (<=>). None of the target columns is
-- declared NOT NULL, and with plain "=" a comparison involving NULL is never
-- true, so any row holding a NULL would never match its existing copy and
-- would be inserted again on every run.
--
-- The INSERT lists its columns explicitly so the load does not depend on the
-- source view's column order or on extra columns appearing in the file.
MERGE INTO raw_data AS TGT
USING uvw_src_raw AS SRC
ON TGT.InvoiceNo        <=> SRC.InvoiceNo
  AND TGT.StockCode     <=> SRC.StockCode
  AND TGT.Description   <=> SRC.Description
  AND TGT.Quantity      <=> SRC.Quantity
  AND TGT.InvoiceDate   <=> SRC.InvoiceDate
  AND TGT.UnitPrice     <=> SRC.UnitPrice
  AND TGT.CustomerID    <=> SRC.CustomerID
  AND TGT.Country       <=> SRC.Country
WHEN NOT MATCHED
  THEN INSERT (
    InvoiceNo
    , StockCode
    , Description
    , Quantity
    , InvoiceDate
    , UnitPrice
    , CustomerID
    , Country
  )
  VALUES (
    SRC.InvoiceNo
    , SRC.StockCode
    , SRC.Description
    , SRC.Quantity
    , SRC.InvoiceDate
    , SRC.UnitPrice
    , SRC.CustomerID
    , SRC.Country
  )