In [1]:
import os
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Context

If the data contained dates prior to 1582-10-15, we would need to correct all such dates to the proleptic Gregorian calendar using SparkSession configs:

```python
spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
```

Spark stores timestamps internally as UTC instants. To control the time zone shift applied when timestamps are parsed and displayed, we can set the session time zone explicitly:
```python
spark.conf.set("spark.sql.session.timeZone", "UTC")
```

In [2]:
# Session options: pull in the Delta Lake package, enable its SQL extension
# and catalog, and pin the session time zone to UTC.
SPARK_CONFIG = {
    "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.session.timeZone": "UTC",
}

builder = SparkSession.builder.appName("LoadDatasetsToBronze")
for option_name, option_value in SPARK_CONFIG.items():
    builder = builder.config(option_name, option_value)

spark = builder.getOrCreate()


# Deliberately imported late: this import does not work until the
# SparkSession above has been initialized.
from delta import DeltaTable

## Ingestion Settings

These variables are hard-coded here, but in production, these should be placed into an ETL settings database.

In [3]:
# (db, table) pairs to ingest, grouped here by source database.
tables = [
    (db_name, table_name)
    for db_name, table_names in (
        ('customers', ('customers', 'customer_details')),
        ('devices', ('device_models', 'devices')),
    )
    for table_name in table_names
]

## Load Dataset from Staging

Note that I have not been able to use the built-in Delta Catalog in a persistent way. This is why I am not saving the data into the Catalog using a command such as:

```python
spark.sql(f"CREATE OR REPLACE TABLE {table_path.hive} USING DELTA LOCATION '{table_path.bronze}'")
```

Instead, I am using a temporary view.

In [4]:
# Bronze locations written by this run, in ingestion order.
tables_saved = []

for db, table in tables:
    # Resolves the staging / bronze locations for this (db, table) pair.
    table_path = PathMerger(db, table)

    # Read the staged parquet files and add a null integer `src_batch_id`.
    # (Tagging rows with F.input_file_name() as `src_file` is currently disabled.)
    staging_reader = spark.read.format("parquet")
    df = staging_reader.load(table_path.staging).withColumn(
        "src_batch_id", F.lit(None).cast("integer")
    )

    # Replace the bronze Delta table (data and schema), written from a single partition.
    bronze_writer = (
        df.repartition(1)
        .write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    bronze_writer.save(table_path.bronze)

    # SQL preview handle. NOTE(review): this view is backed by the staging
    # read above, not by the bronze Delta files that were just written.
    df.createOrReplaceTempView(f"{db}_{table}")

    # Remember the bronze path for spark.read / DeltaTable.forPath previews.
    tables_saved.append(table_path.bronze)

## Access created tables using Python API

In [5]:
# Cap on rows pulled to the driver per table: toPandas() collects its whole
# input into driver memory, so it must not be called on an unbounded table.
PREVIEW_ROWS = 20

for table_path in tables_saved:
    print("\n[INFO] LOADING: ", table_path)
    print("=" * 72)
    df = spark.read.format("delta").load(table_path)

    # Only the preview is limited; `df` itself still refers to the full table.
    display(df.limit(PREVIEW_ROWS).toPandas())
    df.printSchema()


[INFO] LOADING:  S3\bronze\abc\customers\customers


Unnamed: 0,dms_timestamp,id,username,created,modified,src_batch_id
0,2021-08-05 09:30:47,0,janisourander@kamk.fi,1970-01-15 10:00:00,1970-02-20 12:34:56,
1,2021-08-05 09:30:47,1,rscott@gmail.com,2020-01-15 13:37:00,2020-02-26 22:00:52,
2,2021-08-05 09:30:47,2,wfletcher@yahoo.com,2020-01-15 14:29:08,2020-02-21 22:57:29,
3,2021-08-05 09:30:47,3,vhoward@gmail.com,2020-01-15 14:41:13,2020-10-27 17:35:24,
4,2021-08-05 09:30:47,4,petersonmelissa@yahoo.com,2020-01-15 14:58:59,2020-02-15 13:58:09,
5,2021-08-05 09:30:47,5,andrew17@lopez.info,2020-01-15 15:36:56,2020-03-09 06:12:44,
6,2021-08-05 09:30:47,6,nkennedy@carson.biz,2020-01-15 15:55:33,2020-01-28 10:53:03,
7,2021-08-05 09:30:47,7,james44@hotmail.com,2020-01-15 16:17:08,2020-03-24 08:49:00,
8,2021-08-05 09:30:47,8,eibarra@yahoo.com,2020-01-15 17:12:11,2020-11-02 20:20:21,
9,2021-08-05 09:30:47,9,ctorres@yahoo.com,2020-01-15 17:36:47,2020-06-04 20:50:32,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\customers\customer_details


Unnamed: 0,dms_timestamp,id,customer_fk,birthday,language,street_address,postal_code,city,country,phone_number,created,modified,src_batch_id
0,2021-08-05 09:30:47,0,0,1915-02-12,ss,721 Hayes Ridge,21394,Herreraville,EC,698-615-5426x665,1970-01-15 10:00:00,1970-01-15 10:00:00,
1,2021-08-05 09:30:47,1,1,1908-08-07,nr,081 Wyatt Harbors Apt. 671,35887,Hendersontown,MH,045.906.8991x685,2020-01-15 13:37:00,2020-01-15 13:37:00,
2,2021-08-05 09:30:47,2,2,1946-03-10,lzh,17721 Jennifer Cape,13706,Nicholasberg,LV,(881)719-5610,2020-01-15 14:29:08,2020-01-15 14:29:08,
3,2021-08-05 09:30:47,3,3,1985-03-26,vi,039 Jeff Estate,44287,Port Patrick,BY,001-240-687-7797x43638,2020-01-15 14:41:13,2020-01-15 14:41:13,
4,2021-08-05 09:30:47,4,4,1910-03-19,kk,3765 Sarah Meadows,73010,West Meganberg,PT,038-600-1325,2020-01-15 14:58:59,2020-01-15 14:58:59,
5,2021-08-05 09:30:47,5,5,1931-06-09,ce,54293 Wright Ports Apt. 612,47049,Kimchester,US,001-839-632-6952x5447,2020-01-15 15:36:56,2020-01-15 15:36:56,
6,2021-08-05 09:30:47,6,6,1965-02-05,lv,4098 Lacey Mill Apt. 020,22406,Lake Thomasbury,BO,141-356-1342,2020-01-15 15:55:33,2020-01-15 15:55:33,
7,2021-08-05 09:30:47,7,7,2017-11-26,kw,063 Steven Glens,32379,New Carolineview,BJ,+1-069-368-8980x8823,2020-01-15 16:17:08,2020-01-15 16:17:08,
8,2021-08-05 09:30:47,8,8,1971-03-12,gd,195 Stephanie Stream Suite 284,37553,Moraleston,TG,001-402-842-2659x41091,2020-01-15 17:12:11,2020-01-15 17:12:11,
9,2021-08-05 09:30:47,9,9,1980-04-21,bo,9383 Williams Path,60053,South Richardfort,BD,(949)958-5560,2020-01-15 17:36:47,2020-01-15 17:36:47,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- customer_fk: long (nullable = true)
 |-- birthday: date (nullable = true)
 |-- language: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\devices\device_models


Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-08-05 09:30:47,0,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-08-05 09:30:47,1,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-08-05 09:30:47,2,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-08-05 09:30:47,3,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- release_date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\devices\devices


Unnamed: 0,dms_timestamp,id,customer_fk,model_fk,serial_number,created,modified,src_batch_id
0,2021-08-05 09:30:47,0,0,2,642-89-8554,1970-01-15 10:48:20,1970-01-15 11:08:12,
1,2021-08-05 09:30:47,1,1,3,200-67-0041,2020-01-15 14:18:54,2020-01-15 14:54:32,
2,2021-08-05 09:30:47,2,1,2,789-78-9293,2020-01-15 14:18:54,2020-01-15 14:43:00,
3,2021-08-05 09:30:47,3,2,0,650-22-9565,2020-01-15 14:36:22,2020-01-15 15:18:43,
4,2021-08-05 09:30:47,4,2,1,809-87-5733,2020-01-15 14:36:22,2020-01-15 15:03:44,
5,2021-08-05 09:30:47,5,3,1,854-78-7933,2020-01-15 14:59:05,2020-01-15 15:48:19,
6,2021-08-05 09:30:47,6,3,0,465-68-4500,2020-01-15 14:59:05,2020-01-15 15:17:12,
7,2021-08-05 09:30:47,7,4,1,600-29-9728,2020-01-15 15:02:16,2020-01-15 15:09:16,
8,2021-08-05 09:30:47,8,4,3,485-63-7817,2020-01-15 15:02:16,2020-01-15 15:14:06,
9,2021-08-05 09:30:47,9,5,3,470-20-8539,2020-01-15 16:24:24,2020-01-15 16:56:35,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- customer_fk: long (nullable = true)
 |-- model_fk: long (nullable = true)
 |-- serial_number: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)



## Access data using SQL

Above, we registered each table as a temporary view. These views can be queried using Spark SQL.

In [6]:
# Take the first entry reported by SHOW TABLES (the temporary views
# registered earlier in this notebook are listed there) and preview it.
catalog_rows = spark.sql("SHOW TABLES").collect()
first_table = catalog_rows[0]["tableName"]

spark.sql(f"SELECT * FROM {first_table}").show()

+-------------------+---+-----------+----------+--------+--------------------+-----------+-----------------+-------+--------------------+-------------------+-------------------+------------+
|      dms_timestamp| id|customer_fk|  birthday|language|      street_address|postal_code|             city|country|        phone_number|            created|           modified|src_batch_id|
+-------------------+---+-----------+----------+--------+--------------------+-----------+-----------------+-------+--------------------+-------------------+-------------------+------------+
|2021-08-05 09:30:47|  0|          0|1915-02-12|      ss|     721 Hayes Ridge|      21394|     Herreraville|     EC|    698-615-5426x665|1970-01-15 10:00:00|1970-01-15 10:00:00|        null|
|2021-08-05 09:30:47|  1|          1|1908-08-07|      nr|081 Wyatt Harbors...|      35887|    Hendersontown|     MH|    045.906.8991x685|2020-01-15 13:37:00|2020-01-15 13:37:00|        null|
|2021-08-05 09:30:47|  2|          2|1946-03-

## Access data using Delta

In [7]:
# Path of the bronze table to open: the first one appended to `tables_saved`.
tto = tables_saved[0]

# Fail here with a clear message rather than leaving `dt` undefined (or stale
# from an earlier run), which would only surface in the cells that use it.
if not DeltaTable.isDeltaTable(spark, tto):
    raise ValueError(f"Not a Delta table: {tto}")

dt = DeltaTable.forPath(spark, tto)

In [8]:
# History of the Delta table opened above, collected into pandas so the
# notebook renders it as a rich table (one row per recorded version).
dt.history().toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
0,0,2021-08-05 06:43:58.342,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,,False,"{'numOutputRows': '10', 'numOutputBytes': '214...",


In [9]:
# Convert the Delta table to a Spark DataFrame and print its first rows as text.
dt.toDF().show()

+-------------------+---+--------------------+-------------------+-------------------+------------+
|      dms_timestamp| id|            username|            created|           modified|src_batch_id|
+-------------------+---+--------------------+-------------------+-------------------+------------+
|2021-08-05 09:30:47|  0|janisourander@kam...|1970-01-15 10:00:00|1970-02-20 12:34:56|        null|
|2021-08-05 09:30:47|  1|    rscott@gmail.com|2020-01-15 13:37:00|2020-02-26 22:00:52|        null|
|2021-08-05 09:30:47|  2| wfletcher@yahoo.com|2020-01-15 14:29:08|2020-02-21 22:57:29|        null|
|2021-08-05 09:30:47|  3|   vhoward@gmail.com|2020-01-15 14:41:13|2020-10-27 17:35:24|        null|
|2021-08-05 09:30:47|  4|petersonmelissa@y...|2020-01-15 14:58:59|2020-02-15 13:58:09|        null|
|2021-08-05 09:30:47|  5| andrew17@lopez.info|2020-01-15 15:36:56|2020-03-09 06:12:44|        null|
|2021-08-05 09:30:47|  6| nkennedy@carson.biz|2020-01-15 15:55:33|2020-01-28 10:53:03|        null|
