In [1]:
import os
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Context

If the data contained dates earlier than 1582-10-15, we would need to tell Spark how to handle the switch to the proleptic Gregorian calendar for those dates, using SparkSession configs:

```python
spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
```

Spark stores timestamps as UTC instants and renders them in the session time zone. To control which time zone is used, we can set it explicitly:
```python
spark.conf.set("spark.sql.session.timeZone", "UTC")
```

In [2]:
# Session settings that wire Delta Lake into Spark: the delta-core package,
# the Delta SQL extension, the Delta catalog, and a UTC session time zone.
spark_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.session.timeZone": "UTC",
}

# Apply the settings one by one, in the order listed above.
session_builder = SparkSession.builder.appName("LoadDatasetsToBronze")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()


# Deliberately placed after the session is created: the `delta` module
# cannot be imported before the SparkSession has been initialised.
from delta import DeltaTable

## Ingestion Settings

These variables are hard-coded here; in production, they should be stored in an ETL settings database.

In [3]:
# (db, table) pairs to ingest. They are generated from a per-database
# listing so that each database name only has to be written once.
tables = [
    (db_name, table_name)
    for db_name, table_names in {
        "customers": ("customers", "customer_details"),
        "devices": ("device_models", "devices"),
    }.items()
    for table_name in table_names
]

## Load Dataset from Staging

Note that I have not been able to use the built-in Delta Catalog in a persistent way. This is why I am not saving the data into the Catalog using a command such as:

```python
spark.sql(f"CREATE OR REPLACE TABLE {table_path.hive} USING DELTA LOCATION '{table_path.bronze}'")
```

Instead, I am using a temporary view.

In [4]:
# Bronze locations written by this run. Reset on every execution so that
# re-running the cell does not accumulate duplicate entries.
tables_saved = []

for db, table in tables:

    # Resolve the staging and bronze locations for this (db, table) pair.
    table_path = PathMerger(db, table)

    # Read the staged parquet data and add a nullable integer batch id
    # column; no batch id is assigned here, so every value is NULL.
    df = (
        spark
        .read
        .format("parquet")
        .load(table_path.staging)
        # Optional source-file lineage column, currently disabled:
        # .withColumn("src_file", F.input_file_name())
        .withColumn("src_batch_id", F.lit(None).cast("integer"))
    )

    # Write to BRONZE as a Delta table collapsed to one partition,
    # replacing both the existing data and the existing schema.
    (
        df.repartition(1)
        .write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(table_path.bronze)
    )

    # For previewing using SQL. The view is built on the Delta table that
    # was just written rather than on the lazy staging DataFrame, so SQL
    # previews show what is in bronze instead of re-reading staging.
    (
        spark
        .read
        .format("delta")
        .load(table_path.bronze)
        .createOrReplaceTempView(f"{db}_{table}")
    )

    # For previewing using Spark.read or DeltaTable.forPath
    tables_saved.append(table_path.bronze)

## Access created tables using Python API

In [5]:
# Upper bound on the rows pulled to the driver per table. toPandas()
# collects everything it is given, so an unbounded call only works on
# toy-sized tables.
PREVIEW_ROWS = 20

for table_path in tables_saved:
    print("\n[INFO] LOADING: ", table_path)
    print("=" * 72)

    # Read the table back from its bronze location to confirm the write.
    df = spark.read.format("delta").load(table_path)

    # Show a bounded sample as a rich table, then the full schema.
    display(df.limit(PREVIEW_ROWS).toPandas())
    df.printSchema()


[INFO] LOADING:  S3\bronze\company_rds\customers\customers


Unnamed: 0,id,username,password,created,modified,src_batch_id
0,0,janisourander@kamk.fi,873619955484beec8f72b34ae8afa995,1970-01-15 10:00:00,1970-02-20 12:34:56,
1,1,meadowsmary@hotmail.com,584d1b0045918409f3e2d37b955fe86e,2020-01-15 13:37:00,2020-11-04 01:27:47,
2,2,nicolevelasquez@wells.com,9154f32482ea97480b58dc53bbd401d6,2020-01-15 14:30:49,2020-07-20 09:50:47,
3,3,brownamber@liu.com,66767e428490d1b3f911830241b6f4d4,2020-01-15 15:07:30,2020-11-04 10:08:36,
4,4,jennifer81@gmail.com,2b302950516a5a022ddb3932be3ad8e9,2020-01-15 16:01:48,2020-10-05 13:01:52,
5,5,richard48@rodriguez.info,07066cf0cff68b165211648102f46b18,2020-01-15 16:27:24,2020-10-27 19:32:08,
6,6,maria19@gmail.com,4e059cb3b1bede1b7d8077c422541678,2020-01-15 17:18:51,2020-11-28 18:50:21,
7,7,alexanderpalmer@miles-briggs.biz,c1b82c28099c44884230cfae5233cf91,2020-01-15 17:42:46,2020-11-17 17:56:10,
8,8,antonioshaw@hatfield.com,426aecbf939872f0b2ac67201a35452c,2020-01-15 18:26:36,2020-11-08 14:26:26,
9,9,christina03@payne.net,01077914a3eb23a3ae2ce51d8063330d,2020-01-15 19:11:43,2020-08-05 21:24:51,


root
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)
 |-- password: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\company_rds\customers\customer_details


Unnamed: 0,id,release_date,name,color,description,created,modified,src_batch_id
0,0,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,1,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,3,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


root
 |-- id: long (nullable = true)
 |-- release_date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\company_rds\devices\device_models


Unnamed: 0,id,release_date,name,color,description,created,modified,src_batch_id
0,0,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,1,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,3,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


root
 |-- id: long (nullable = true)
 |-- release_date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\company_rds\devices\devices


Unnamed: 0,id,release_date,name,color,description,created,modified,src_batch_id
0,0,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,1,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,3,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


root
 |-- id: long (nullable = true)
 |-- release_date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)



## Access data using SQL

Above, we registered each table as a temporary view. These views can be queried using Spark SQL.

In [6]:
# List what is registered in the session and pick the first entry.
registered_tables = spark.sql("SHOW TABLES").collect()
first_table = registered_tables[0].tableName

# Preview that table through plain SQL.
preview_query = f"SELECT * FROM {first_table}"
spark.sql(preview_query).show()

+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+
| id|release_date|            name|color|description|            created|           modified|src_batch_id|
+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+
|  0|  2010-05-15|Super Gadget 100|  Red|lorem ipsum|2010-03-21 12:00:01|2010-03-21 12:00:01|        null|
|  1|  2010-05-15|Super Gadget 100|Black|lorem ipsum|2010-03-21 12:00:02|2010-03-21 12:00:02|        null|
|  2|  2010-11-01|Super Gadget 100| Pink|lorem ipsum|2010-08-05 07:00:00|2010-08-05 07:00:00|        null|
|  3|  2018-05-13|Super Gadget 200|White|lorem ipsum|2018-03-20 12:01:01|2018-03-20 12:01:01|        null|
+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+



## Access data using Delta

In [7]:
# Table to open: the first bronze location written by the ingestion loop.
tto = tables_saved[0]

# Fail here with a clear message instead of leaving `dt` undefined (or
# pointing at a table from an earlier run), which would only surface as a
# confusing error in the cells that use `dt`.
if not DeltaTable.isDeltaTable(spark, tto):
    raise ValueError(f"Not a Delta table: {tto}")

dt = DeltaTable.forPath(spark, tto)

In [8]:
# Commit history of the opened Delta table, displayed as a pandas frame.
commit_log = dt.history()
commit_log.toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
0,0,2021-08-04 11:32:03.468,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,,False,"{'numOutputRows': '10', 'numOutputBytes': '248...",


In [9]:
# Current contents of the opened Delta table, printed as a text table.
current_rows = dt.toDF()
current_rows.show()

+---+--------------------+--------------------+-------------------+-------------------+------------+
| id|            username|            password|            created|           modified|src_batch_id|
+---+--------------------+--------------------+-------------------+-------------------+------------+
|  0|janisourander@kam...|873619955484beec8...|1970-01-15 10:00:00|1970-02-20 12:34:56|        null|
|  1|meadowsmary@hotma...|584d1b0045918409f...|2020-01-15 13:37:00|2020-11-04 01:27:47|        null|
|  2|nicolevelasquez@w...|9154f32482ea97480...|2020-01-15 14:30:49|2020-07-20 09:50:47|        null|
|  3|  brownamber@liu.com|66767e428490d1b3f...|2020-01-15 15:07:30|2020-11-04 10:08:36|        null|
|  4|jennifer81@gmail.com|2b302950516a5a022...|2020-01-15 16:01:48|2020-10-05 13:01:52|        null|
|  5|richard48@rodrigu...|07066cf0cff68b165...|2020-01-15 16:27:24|2020-10-27 19:32:08|        null|
|  6|   maria19@gmail.com|4e059cb3b1bede1b7...|2020-01-15 17:18:51|2020-11-28 18:50:21|    