In [19]:
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Context

If the data contained dates prior to 1582-10-15, we would need to correct all such dates to the proleptic Gregorian calendar using SparkSession configs:

```python
spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
```

Spark assumes times to be in UTC. To apply a time-zone shift, we could specify a session time zone:
```python
spark.conf.set("spark.sql.session.timeZone", "UTC")
```

In [20]:
# Session settings applied to the builder: the Delta Lake package with its SQL
# extension and catalog, a UTC session time zone, and the Parquet compression
# codec set to 'None'.
spark_settings = {
    'spark.jars.packages': 'io.delta:delta-core_2.12:0.8.0',
    'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
    'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog',
    'spark.sql.session.timeZone': 'UTC',
    'spark.sql.parquet.compression.codec': 'None',
}

session_builder = SparkSession.builder.appName('LoadDatasetsToBronze')
for setting, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting, setting_value)

# Reuses an already running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Ingestion Settings

These variables are hard-coded here; in production, they should be stored in an ETL settings database.

In [21]:
# Tables to ingest, grouped by source database. The mapping is flattened into
# the list of (db, table) pairs that the ingestion loop iterates over.
tables_by_db = {
    'customers': ('customers', 'customer_details'),
    'devices': ('device_models', 'devices'),
}

tables = [
    (db_name, table_name)
    for db_name, table_names in tables_by_db.items()
    for table_name in table_names
]

## Load Dataset from Staging

Note that I have not been able to use the built-in Delta Catalog in a persistent way. This is why I am not saving the data into the Catalog using a command such as:

```python
spark.sql(f"CREATE OR REPLACE TABLE {table_path.hive} USING DELTA LOCATION '{table_path.bronze}'")
```

Instead, I am using a temporary view.

In [22]:
# Bronze paths written by this cell. Reset on every run so that re-executing
# the cell does not accumulate duplicate entries.
tables_saved = []

for db, table in tables:

    # Resolve the staging and bronze locations for this (db, table) pair
    table_path = PathMerger(db, table)

    # Load the staged Parquet files and add a placeholder batch-id column
    # (a NULL literal cast to integer).
    staging_df = (
        spark
        .read
        .format('parquet')
        .load(table_path.staging)
        .withColumn('src_batch_id', F.lit(None).cast('integer'))
    )

    # Write to BRONZE as Delta, replacing both the data and the schema
    (
        staging_df
        .write
        .format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .save(table_path.bronze)
    )

    # For previewing using SQL: register the view on top of the Delta table
    # that was just written, so SQL queries show what is actually stored in
    # bronze rather than re-reading the staging files.
    (
        spark
        .read
        .format('delta')
        .load(table_path.bronze)
        .createOrReplaceTempView(f'{db}_{table}')
    )

    # For previewing using Spark.read or DeltaTable.forPath
    tables_saved.append(table_path.bronze)

## Access created tables using Python API

In [23]:
# Cap on the rows pulled to the driver per table. toPandas() collects every
# row it is given, so an unbounded call would load whole tables into memory.
PREVIEW_ROWS = 10

# `bronze_path` is a plain path string taken from tables_saved; it is named
# differently from the PathMerger object used during ingestion on purpose.
for bronze_path in tables_saved:
    print('\n[INFO] LOADING: ', bronze_path)
    print('=' * 72)
    bronze_df = spark.read.format('delta').load(bronze_path)

    # Rich display of a bounded sample, followed by the full table schema
    display(bronze_df.limit(PREVIEW_ROWS).toPandas())
    bronze_df.printSchema()


[INFO] LOADING:  S3\bronze\abc\customers\customers


Unnamed: 0,dms_timestamp,id,username,created,modified,src_batch_id
0,2021-08-15 09:40:46,1,janisourander@kamk.fi,1970-01-15 10:00:00,1970-02-20 12:34:56,
1,2021-08-15 09:40:46,1,tonywilliams@barnes.com,2020-01-15 13:37:00,2020-07-13 08:26:21,
2,2021-08-15 09:40:46,2,cmarsh@yahoo.com,2020-01-15 13:39:56,2020-10-08 10:25:34,
3,2021-08-15 09:40:46,3,carriegreen@marquez-gray.com,2020-01-15 14:00:09,2020-04-27 13:42:13,
4,2021-08-15 09:40:46,4,hthomas@yahoo.com,2020-01-15 14:05:31,2020-08-06 01:27:09,
5,2021-08-15 09:40:46,5,wbishop@hotmail.com,2020-01-15 14:50:35,2020-09-30 20:13:59,
6,2021-08-15 09:40:46,6,steven77@martinez.com,2020-01-15 15:00:34,2020-04-10 23:27:16,
7,2021-08-15 09:40:46,7,smithjames@gmail.com,2020-01-15 15:01:20,2020-12-13 10:11:28,
8,2021-08-15 09:40:46,8,gperez@hotmail.com,2020-01-15 15:24:21,2020-05-09 21:45:26,
9,2021-08-15 09:40:46,9,khale@hotmail.com,2020-01-15 16:04:34,2020-11-29 22:48:03,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\customers\customer_details


Unnamed: 0,dms_timestamp,id,customer_fk,birthday,language,street_address,postal_code,city,country,phone_number,created,modified,src_batch_id
0,2021-08-15 09:40:46,1,1,1942-09-01,gez,19254 Le Shoal,96842,West Jessicafurt,BG,001-732-790-0908,1970-01-15 10:00:00,1970-01-15 10:00:00,
1,2021-08-15 09:40:46,2,1,1988-09-30,lo,709 Dawn Crescent,47467,Stewartview,KR,+1-702-616-0085,2020-01-15 13:37:00,2020-01-15 13:37:00,
2,2021-08-15 09:40:46,3,2,1960-07-28,en,93565 Jessica Fields Suite 008,97313,Porterport,CN,858.030.0424x156,2020-01-15 13:39:56,2020-01-15 13:39:56,
3,2021-08-15 09:40:46,4,3,1921-01-21,ur,11739 Hernandez Orchard Suite 495,5875,Michaelfurt,HU,001-421-905-1192x1044,2020-01-15 14:00:09,2020-01-15 14:00:09,
4,2021-08-15 09:40:46,5,4,1910-07-21,hr,182 Terry View,67252,Parkborough,AU,999.657.5608,2020-01-15 14:05:31,2020-01-15 14:05:31,
5,2021-08-15 09:40:46,6,5,2019-08-29,mag,369 Smith Forks Suite 570,60978,New Jesse,GY,395-576-0375x91476,2020-01-15 14:50:35,2020-01-15 14:50:35,
6,2021-08-15 09:40:46,7,6,1929-12-16,tg,600 Marvin Rue,647,New Ambershire,LT,2269049584,2020-01-15 15:00:34,2020-01-15 15:00:34,
7,2021-08-15 09:40:46,8,7,2013-05-10,kk,48189 Jessica Alley,25141,North Chadton,MK,5446683628,2020-01-15 15:01:20,2020-01-15 15:01:20,
8,2021-08-15 09:40:46,9,8,1934-11-02,ta,3591 Burton Junctions Suite 749,517,Parkershire,SE,+1-666-343-0512x13500,2020-01-15 15:24:21,2020-01-15 15:24:21,
9,2021-08-15 09:40:46,10,9,1947-09-14,tt,2744 Nicole Port Suite 898,42038,Velezborough,SO,4169241456,2020-01-15 16:04:34,2020-01-15 16:04:34,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- customer_fk: long (nullable = true)
 |-- birthday: date (nullable = true)
 |-- language: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\devices\device_models


Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-08-15 09:40:46,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-08-15 09:40:46,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-08-15 09:40:46,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-08-15 09:40:46,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- release_date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\devices\devices


Unnamed: 0,dms_timestamp,id,customer_fk,model_fk,serial_number,created,modified,src_batch_id
0,2021-08-15 09:40:46,1,1,2,846-09-7018,1970-01-15 10:40:34,1970-01-15 11:19:42,
1,2021-08-15 09:40:46,2,1,4,814-78-8514,1970-01-15 10:40:34,1970-01-15 11:25:19,
2,2021-08-15 09:40:46,3,1,4,305-68-2901,2020-01-15 14:02:05,2020-01-15 14:33:37,
3,2021-08-15 09:40:46,4,1,1,064-41-8841,2020-01-15 14:02:05,2020-01-15 14:28:25,
4,2021-08-15 09:40:46,5,2,4,755-25-7677,2020-01-15 13:40:13,2020-01-15 13:48:17,
5,2021-08-15 09:40:46,6,2,2,827-85-3026,2020-01-15 13:40:13,2020-01-15 13:59:16,
6,2021-08-15 09:40:46,7,3,4,838-29-0909,2020-01-15 14:02:51,2020-01-15 14:50:53,
7,2021-08-15 09:40:46,8,3,3,090-39-2995,2020-01-15 14:02:51,2020-01-15 15:02:30,
8,2021-08-15 09:40:46,9,4,2,388-11-5076,2020-01-15 14:29:49,2020-01-15 14:34:36,
9,2021-08-15 09:40:46,10,5,2,268-23-3325,2020-01-15 15:06:57,2020-01-15 16:05:29,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- customer_fk: long (nullable = true)
 |-- model_fk: long (nullable = true)
 |-- serial_number: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)



## Access data using SQL

Above, we registered the data as temporary views. These can be queried using Spark SQL.

In [24]:
# Temp view to preview. The views were registered as '<db>_<table>', so the
# name is built from the first configured pair instead of indexing into the
# SHOW TABLES result: that lookup depends on the row order of the listing and
# raises IndexError when fewer than two tables/views exist.
tts = '_'.join(tables[0])

spark.sql(f"SELECT * FROM {tts}").show()

+-------------------+---+--------------------+-------------------+-------------------+------------+
|      dms_timestamp| id|            username|            created|           modified|src_batch_id|
+-------------------+---+--------------------+-------------------+-------------------+------------+
|2021-08-15 09:40:46|  1|janisourander@kam...|1970-01-15 10:00:00|1970-02-20 12:34:56|        null|
|2021-08-15 09:40:46|  1|tonywilliams@barn...|2020-01-15 13:37:00|2020-07-13 08:26:21|        null|
|2021-08-15 09:40:46|  2|    cmarsh@yahoo.com|2020-01-15 13:39:56|2020-10-08 10:25:34|        null|
|2021-08-15 09:40:46|  3|carriegreen@marqu...|2020-01-15 14:00:09|2020-04-27 13:42:13|        null|
|2021-08-15 09:40:46|  4|   hthomas@yahoo.com|2020-01-15 14:05:31|2020-08-06 01:27:09|        null|
|2021-08-15 09:40:46|  5| wbishop@hotmail.com|2020-01-15 14:50:35|2020-09-30 20:13:59|        null|
|2021-08-15 09:40:46|  6|steven77@martinez...|2020-01-15 15:00:34|2020-04-10 23:27:16|        null|


## Access data using Delta

In [25]:
# Table to open
tto = tables_saved[0]

# Fail here with a clear message instead of leaving `dt` undefined (or bound
# to a table from an earlier run) when the path is not a Delta table.
if not DeltaTable.isDeltaTable(spark, tto):
    raise ValueError(f'Not a Delta table: {tto}')

dt = DeltaTable.forPath(spark, tto)

In [26]:
# History of the Delta table, converted to pandas for rich display
table_history = dt.history()
table_history.toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
0,0,2021-08-22 07:57:22.399,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,,False,"{'numOutputRows': '10', 'numOutputBytes': '223...",


In [27]:
# Print the Delta table's rows through its Spark DataFrame
delta_df = dt.toDF()
delta_df.show()

+-------------------+---+--------------------+-------------------+-------------------+------------+
|      dms_timestamp| id|            username|            created|           modified|src_batch_id|
+-------------------+---+--------------------+-------------------+-------------------+------------+
|2021-08-15 09:40:46|  1|janisourander@kam...|1970-01-15 10:00:00|1970-02-20 12:34:56|        null|
|2021-08-15 09:40:46|  1|tonywilliams@barn...|2020-01-15 13:37:00|2020-07-13 08:26:21|        null|
|2021-08-15 09:40:46|  2|    cmarsh@yahoo.com|2020-01-15 13:39:56|2020-10-08 10:25:34|        null|
|2021-08-15 09:40:46|  3|carriegreen@marqu...|2020-01-15 14:00:09|2020-04-27 13:42:13|        null|
|2021-08-15 09:40:46|  4|   hthomas@yahoo.com|2020-01-15 14:05:31|2020-08-06 01:27:09|        null|
|2021-08-15 09:40:46|  5| wbishop@hotmail.com|2020-01-15 14:50:35|2020-09-30 20:13:59|        null|
|2021-08-15 09:40:46|  6|steven77@martinez...|2020-01-15 15:00:34|2020-04-10 23:27:16|        null|
