In [1]:
import os
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Context

If there would be dates prior to 1582-10-15, we would need to correct all dates before that to prolectic gregorian calendar using SparkSession configs:

```python
spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
```

Spark assumes times to be in UTC. To apply tz shift, we can change this to another timezone:
```python
spark.conf.set("spark.sql.session.timeZone", "UTC")
```

In [2]:
spark = (SparkSession.builder
         .appName('LoadDatasetsToBronze')
         .config('spark.jars.packages', 'io.delta:delta-core_2.12:1.0.0')
         .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
         .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog')
         .config('spark.sql.session.timeZone', 'UTC')
         .config('spark.sql.parquet.compression.codec', 'None')
         .getOrCreate())


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Ingestion Settings

These variables are hard-coded here, but in production, these should be placed into an ETL settings database.

In [3]:
# Map of combinations to create: (source_system, db, table)
tables = [
    ('abc', 'customers', 'customers'), 
    ('abc', 'customers', 'customer_details'),
    ('abc', 'devices', 'device_models'),
    ('abc', 'devices', 'devices')
]

## Load Dataset from Staging

In [4]:
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

for _, db, table in tables:
    
    # Instantiate
    pm = PathMerger(db, table)
    
    # Load the src table
    df = (
        spark
        .read
        .format('parquet')
        .load(pm.staging)
        # .withColumn('src_file', F.input_file_name())
        .withColumn('src_batch_id', F.lit(None).cast('integer'))
    )
    
    # Write to BRONZE
    (
        df
        .write
        .format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .option('path', os.path.abspath(pm.bronze))
        .saveAsTable(pm.hive)
    )

# Examine the Output

All four tables have been written to bronze database.

In [5]:
spark.sql("SHOW TABLES in bronze").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
|  bronze|abc_customers_cus...|      false|
|  bronze|abc_customers_cus...|      false|
|  bronze|abc_devices_devic...|      false|
|  bronze| abc_devices_devices|      false|
+--------+--------------------+-----------+



## Select a table for further CDC studies

For the next Notebooks, we will be focusing in the the `device_models` table. Reasoning for this is that the table has been manually written, so it is easy to examine the change data MERGE.

In [6]:
pm = PathMerger('devices', 'device_models')

w = 40

print("The files originated from: ".ljust(w), pm.staging)
print("The Delta table is located at: ".ljust(w), pm.bronze)
print("...and it can be called from Hive as: ".ljust(w), pm.hive)


The files originated from:               S3\staging\dms\abc\devices\device_models
The Delta table is located at:           S3\bronze\abc\devices\device_models
...and it can be called from Hive as:    bronze.abc_devices_devicemodels


## Examine Hive Details

In [7]:
spark.sql(f"DESCRIBE EXTENDED {pm.hive}").toPandas()

Unnamed: 0,col_name,data_type,comment
0,dms_timestamp,string,
1,id,bigint,
2,release_date,date,
3,name,string,
4,color,string,
5,description,string,
6,created,timestamp,
7,modified,timestamp,
8,src_batch_id,int,
9,,,


## Access the table using SQL API

In [8]:
spark.sql(f"SELECT * FROM {pm.hive}").toPandas()

Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-09-11 11:30:04,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-09-11 11:30:04,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-09-11 11:30:04,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-09-11 11:30:04,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


## Access the table using Python API

In [9]:
df = spark.table(pm.hive)
# OR spark.read.load(pm.bronze)
# OR spark.sql("SELECT * FROM {pm.hive}")

df.toPandas()

Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-09-11 11:30:04,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-09-11 11:30:04,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-09-11 11:30:04,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-09-11 11:30:04,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


## Access data using Delta

In [10]:
dt = DeltaTable.forName(spark, pm.hive)
# OR: DeltaTable.forPath(spark, pm.bronze)

In [11]:
dt.history().toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
0,0,2021-09-12 06:26:03.746,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,,,False,"{'numOutputRows': '4', 'numOutputBytes': '2569...",
