# 01-Load a staging table
This notebook extracts data from source system files into a staging table.
In this example we extract raw sales data stored in Parquet files into a Delta Lake table.
The notebook also walks through some common transformations you will encounter in real-life scenarios.
					
## Contents
1. Extract
1. Transform
1. Load

In [1]:
# Notebook parameters

# Name of the source folder to load, relative to basePath
filePath = "FactInternetSales"

# Root location of the source files (abfss URI). Keep the trailing slash:
# the extract step builds the full path as basePath + filePath.
basePath = "abfss://synapse@oneclickpocadls.dfs.core.windows.net/AdventureWorksDW2019/dbo/"


### 1. Extract

In [2]:
# Read the raw parquet source into a Spark DataFrame
rawSourcePath = basePath + filePath
rawDF = spark.read.parquet(rawSourcePath)

# For CSV or JSON sources, use the matching reader instead, e.g.
# df = spark.read.option("header","true").option("inferSchema","true").csv(filePath)
# df = spark.read.json(filePath)

# Preview up to 10 rows of the raw data
display(rawDF.limit(10))

In [26]:
# Exploratory data analysis: number of rows in the raw source data
rawDF.count()

In [27]:
# Summary statistics on a sample of 100 rows (the limit below is 100, not 1000)
display(rawDF.limit(100).summary())

In [28]:

# Check the field names and data types of the source data before transforming it
rawDF.printSchema()

### 2. Transform

In [31]:
# Prototype a transformation before applying it for real:
# show only the transformed field, on up to 1000 rows of the raw data
from pyspark.sql.functions import *

# First 10 characters of OrderDate rendered as a string
orderDateText = substring(col("OrderDate").cast("String"), 1, 10)
display(rawDF.limit(1000).select(orderDateText))

In [33]:
# Always import these two sets of libraries at a minimum for spark transformations
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Pattern used to parse the first 10 characters of OrderDate into a date
dateFormat = "yyyy-M-d"

# Transformations applied below
# 1. Filter rows (keep only rows with a positive ExtendedAmount)
# 2. Select a subset of fields from source
# 3. Rename fields to more readable names
# 4. Change data types (OrderDate is truncated to its first 10 characters and parsed as a date)
cleanDF = (
    rawDF
    # Filter on the source column BEFORE projecting: ExtendedAmount is not one of the
    # selected fields, so filtering first avoids depending on the analyzer to resolve
    # a column that the projection no longer exposes.
    .filter("ExtendedAmount > 0")
    .select(
        # Key columns
        col("CustomerKey").alias("customer_sk"),
        col("ProductKey").alias("product_sk"),
        col("OrderDateKey").alias("order_date_sk"),
        col("ShipDateKey").alias("ship_date_sk"),
        col("DueDateKey").alias("due_date_sk"),
        col("PromotionKey").alias("promotion_sk"),
        col("CurrencyKey").alias("currency_sk"),
        col("SalesTerritoryKey").alias("sales_territory_sk"),
        # Order identifiers
        col("SalesOrderNumber").alias("sales_order_no"),
        col("SalesOrderLineNumber").alias("sales_order_line_no"),
        col("RevisionNumber").alias("revision_no"),
        # Quantities, dates and amounts
        col("OrderQuantity").alias("order_qty"),
        to_date(substring(col("OrderDate").cast("String"), 1, 10), dateFormat).alias("order_date"),
        col("SalesAmount").alias("sales_amount"),
        col("DiscountAmount").alias("discount_amount"),
        col("Freight").alias("freight"),
    )
)

# Preview up to 100 rows of the cleaned data
display(cleanDF.limit(100))

In [34]:
# Number of rows in the cleaned DataFrame
cleanDF.count()

In [35]:
# Check the field names and data types of the cleaned DataFrame
cleanDF.printSchema()

In [36]:
# Registering the DataFrame as a temporary view lets later cells process it using SQL syntax
# (the view is only a name for this DataFrame; it is created or replaced on every run)
# Transforming data using Scala or Python or SQL DOES NOT affect the performance of the processing
# Ultimately all transformations are optimized by Spark and operated using RDDs

cleanDF.createOrReplaceTempView("clean_tmp")

### 3. Load

In [12]:
%%sql
-- Create the target database if it is not there yet (safe to re-run).
-- Ideally you want to specify where this database will be stored by using the LOCATION parameter, e.g.
-- CREATE DATABASE sparklakehouse LOCATION 'abfss://synapse@oneclickpocadls.dfs.core.windows.net/tpcds1tbparquet/'
CREATE DATABASE IF NOT EXISTS sparklakehouse

In [43]:
#%%sql
#DROP TABLE sparklakehouse.stg_internet_sales

In [38]:
%%sql
-- Creating Spark tables using the Delta format allows ACID transactions on data lake tables
-- This is a one-time task: because of IF NOT EXISTS, the statement does nothing once the table exists
-- The table is created from, and initially loaded with, the rows of the clean_tmp view
CREATE TABLE IF NOT EXISTS sparklakehouse.stg_internet_sales USING DELTA
AS
SELECT * FROM clean_tmp

In [39]:
%%sql
-- We can merge only changes into the target table
-- This cell should be commented out during the initial load
-- Rows are matched on every key column plus the order number and order line number;
-- matched rows are overwritten with the source values, unmatched rows are inserted.
-- NOTE(review): the promotion_sk predicate was listed twice and sales_territory_sk was the
-- only key column missing from the match, so the duplicate was replaced with
-- sales_territory_sk - confirm this is the intended match key.
MERGE INTO sparklakehouse.stg_internet_sales t
USING clean_tmp s
ON t.customer_sk = s.customer_sk
and t.product_sk = s.product_sk
and t.order_date_sk = s.order_date_sk
and t.ship_date_sk = s.ship_date_sk
and t.due_date_sk = s.due_date_sk
and t.promotion_sk = s.promotion_sk
and t.currency_sk = s.currency_sk
and t.sales_territory_sk = s.sales_territory_sk
and t.sales_order_no = s.sales_order_no
and t.sales_order_line_no = s.sales_order_line_no
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *


In [40]:
%%sql
-- Remove data files that the table no longer references and that are older than
-- the retention period given here (168 hours = 7 days)
VACUUM sparklakehouse.stg_internet_sales RETAIN 168 HOURS;

In [42]:
%%sql
-- Inspect the staging table's metadata to verify the load
DESCRIBE DETAIL sparklakehouse.stg_internet_sales