In [1]:
# Script to initialize the Data Warehouse/Lakehouse and create the required tables

In [1]:
# Import required libraries
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a Hive-enabled SparkSession for this notebook.
# NOTE(review): the app name is spelled "Intialize"; it is kept unchanged here
# because it is a runtime-visible string.
spark = (
    SparkSession.builder
    .appName("Intialize Lakehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Only surface ERROR-level messages from Spark's logging.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: render the session summary as the cell output.
spark

:: loading settings :: url = jar:file:/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8c71c36c-027a-44cb-8952-070467d32be6;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.1 in central
	found io.delta#delta-storage;2.1.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 218ms :: artifacts dl 12ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	io.delta#delta-core_2.12;2.1.1 from central in [default]
	io.delta#delta-storage;2.1.1 from central in [default]
	org.antlr#antlr4-runtime;4.

In [3]:
# Make sure the dw schema is present in the catalog (no-op when it already exists)
spark.sql("create database if not exists dw")

# List every database so the cell output confirms that dw is there
catalog_databases = spark.sql("show databases")
catalog_databases.show()

+---------+
|namespace|
+---------+
|  default|
|       dw|
+---------+



In [4]:
# Store dimension: any existing dw.dim_store is dropped first, then the table
# is created again as a Delta table.
store_drop_stmt = """drop table if exists dw.dim_store"""
store_create_stmt = """
create table dw.dim_store (
    row_wid string,
    store_id string,
    store_name string,
    address string,
    city string,
    state string,
    zip_code string,
    phone_number string,
    insert_dt date,
    update_dt date
)
USING delta
;
"""

spark.sql(store_drop_stmt)
spark.sql(store_create_stmt)

print("SPARK-APP: Store dimension created")

                                                                                

SPARK-APP: Store dimension created


In [5]:
# Plan Type dimension: any existing dw.dim_plan_type is dropped first, then the
# table is created again as a Delta table.
plan_type_drop_stmt = """drop table if exists dw.dim_plan_type"""
plan_type_create_stmt = """
create table dw.dim_plan_type (
    plan_type_code string,
    plan_name string,
    insert_dt date,
    update_dt date
)
USING delta
;
"""

spark.sql(plan_type_drop_stmt)
spark.sql(plan_type_create_stmt)

print("SPARK-APP: Plan Type dimension created")

                                                                                

SPARK-APP: Plan Type dimension created


In [6]:
# Date dimension: any existing dw.dim_date is dropped first, then the table is
# created again as a Delta table.
date_drop_stmt = """drop table if exists dw.dim_date"""
date_create_stmt = """
create table dw.dim_date (
    row_wid string,
    date date,
    month int,
    year int,
    day_of_week string,
    insert_dt date,
    update_dt date
)
USING delta
;
"""

spark.sql(date_drop_stmt)
spark.sql(date_create_stmt)

print("SPARK-APP: Date dimension created")

SPARK-APP: Date dimension created


In [7]:
# Product dimension: any existing dw.dim_product is dropped first, then the
# table is created again as a Delta table.
# The effective_start_dt / effective_end_dt / active_flg columns presumably
# carry row-versioning information — TODO confirm against the load job.
product_drop_stmt = """drop table if exists dw.dim_product"""
product_create_stmt = """
create table dw.dim_product (
    row_wid string,
    product_id string,
    product_name string,
    brand string,
    type string,
    flavor string,
    size string,
    price bigint,
    expiration_dt date,
    image_url string,
    effective_start_dt date,
    effective_end_dt date,
    active_flg int,
    insert_dt date,
    update_dt date
)
USING delta
;
"""

spark.sql(product_drop_stmt)
spark.sql(product_create_stmt)

print("SPARK-APP: Product dimension created")

                                                                                

SPARK-APP: Product dimension created


In [8]:
# Customer dimension: any existing dw.dim_customer is dropped first, then the
# table is created again as a Delta table.
# The effective_start_dt / effective_end_dt / active_flg columns presumably
# carry row-versioning information — TODO confirm against the load job.
customer_drop_stmt = """drop table if exists dw.dim_customer"""
customer_create_stmt = """
create table dw.dim_customer (
    row_wid string,
    customer_id string,
    first_name string,
    last_name string,
    address string,
    city string,
    state string,
    zip_code string,
    phone_number string,
    email string,
    date_of_birth date,
    plan_type string,
    effective_start_dt date,
    effective_end_dt date,
    active_flg int,
    insert_dt date,
    update_dt date
)
USING delta
;
"""

spark.sql(customer_drop_stmt)
spark.sql(customer_create_stmt)

print("SPARK-APP: Customer dimension created")

SPARK-APP: Customer dimension created


In [9]:
# Sales fact: any existing dw.fact_sales is dropped first, then the table is
# created again as a Delta table.
# The *_wid columns presumably reference the row_wid of the matching
# dimension tables — TODO confirm against the load job.
sales_drop_stmt = """drop table if exists dw.fact_sales"""
sales_create_stmt = """
create table dw.fact_sales (
    date_wid string,
    product_wid string,
    store_wid string,
    customer_wid string,
    order_id string,
    invoice_num string,
    qty int,
    tax double,
    discount double,
    line_total double,
    integration_key string,
    insert_dt date
)
USING delta
;
"""

spark.sql(sales_drop_stmt)
spark.sql(sales_create_stmt)

print("SPARK-APP: Sales Fact created")

SPARK-APP: Sales Fact created


In [10]:
# Print every table registered under the dw schema so the cell output
# lists what now exists in the Data Warehouse/Lakehouse
dw_tables = spark.sql("show tables in dw")
dw_tables.show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|       dw| dim_customer|      false|
|       dw|     dim_date|      false|
|       dw|dim_plan_type|      false|
|       dw|  dim_product|      false|
|       dw|    dim_store|      false|
|       dw|   fact_sales|      false|
+---------+-------------+-----------+



In [11]:
# Shut down the SparkSession now that the schema and tables have been created and listed
spark.stop()