In [1]:
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Context

If the data contained dates prior to 1582-10-15, we would need to correct all of those earlier dates to the proleptic Gregorian calendar using SparkSession configs:

```python
spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
```

Spark assumes times to be in UTC. To apply a time zone shift, we could specify a session time zone:
```python
spark.conf.set("spark.sql.session.timeZone", "UTC")
```

In [2]:
# Session settings: pull in the Delta Lake package, enable its SQL extension
# and catalog, and pin the session time zone to UTC.
session_conf = {
    "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.session.timeZone": "UTC",
}

builder = SparkSession.builder.appName("LoadDatasetsToBronze")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)

# Create the session, or reuse one that already exists in this kernel.
spark = builder.getOrCreate()


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Ingestion Settings

These variables are hard-coded here, but in production, these should be placed into an ETL settings database.

In [3]:
# (db, table) pairs to ingest; each pair is processed once by the load loop.
tables = [
    ("customers", "customers"),
    ("customers", "customer_details"),
    ("devices", "device_models"),
    ("devices", "devices"),
]

## Load Dataset from Staging

Note that I have not been able to use the built-in Delta Catalog in a persistent way. This is why I am not saving the data into the Catalog using a command such as:

```python
spark.sql(f"CREATE OR REPLACE TABLE {table_path.hive} USING DELTA LOCATION '{table_path.bronze}'")
```

Instead, I am using a temporary view.

In [4]:
# Bronze paths written during this run; later cells read them back.
tables_saved = []

for db, table in tables:

    # Path helper for this (db, table) pair; exposes .staging and .bronze
    table_path = PathMerger(db, table)

    # Read the staged parquet data and add a null integer batch-id column.
    # (A "src_file" column via F.input_file_name() is currently left out.)
    staged = spark.read.format("parquet").load(table_path.staging)
    df = staged.withColumn("src_batch_id", F.lit(None).cast("integer"))

    # Write to BRONZE as Delta from a single partition, replacing any
    # existing data and schema at that location
    writer = (
        df.repartition(1)
        .write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    writer.save(table_path.bronze)

    # Register a temp view named "<db>_<table>" for previewing with SQL
    df.createOrReplaceTempView(f"{db}_{table}")

    # Remember the bronze path for Spark.read / DeltaTable.forPath previews
    tables_saved.append(table_path.bronze)

## Access created tables using Python API

In [5]:
# Upper bound on the rows pulled to the driver per table. toPandas() collects
# everything it is given, so the preview is capped rather than loading a
# whole bronze table into local memory.
PREVIEW_ROWS = 20

for table_path in tables_saved:
    print("\n[INFO] LOADING: ", table_path)
    print("=" * 72)

    # Read the Delta table back from its bronze location
    df = spark.read.format("delta").load(table_path)

    # Rich display of a bounded sample, followed by the full schema
    display(df.limit(PREVIEW_ROWS).toPandas())
    df.printSchema()


[INFO] LOADING:  S3\bronze\abc\customers\customers


Unnamed: 0,dms_timestamp,id,username,created,modified,src_batch_id
0,2021-08-05 15:19:54,1,janisourander@kamk.fi,1970-01-15 10:00:00,1970-02-20 12:34:56,
1,2021-08-05 15:19:54,1,gabrielmills@gmail.com,2020-01-15 13:37:00,2021-01-07 09:15:05,
2,2021-08-05 15:19:54,2,joseweber@gmail.com,2020-01-15 13:50:08,2020-05-15 17:43:29,
3,2021-08-05 15:19:54,3,kjensen@wade.com,2020-01-15 14:11:15,2020-02-17 17:12:01,
4,2021-08-05 15:19:54,4,alexandrawebb@gmail.com,2020-01-15 15:11:12,2020-07-02 01:06:48,
5,2021-08-05 15:19:54,5,juliecrawford@delgado-adams.com,2020-01-15 15:36:15,2020-09-12 05:02:53,
6,2021-08-05 15:19:54,6,thompsonrachel@morales-brown.com,2020-01-15 16:18:10,2020-09-19 08:25:01,
7,2021-08-05 15:19:54,7,diane74@yahoo.com,2020-01-15 16:42:02,2020-12-23 11:11:17,
8,2021-08-05 15:19:54,8,kimsusan@bradley.info,2020-01-15 17:20:06,2020-07-24 15:52:01,
9,2021-08-05 15:19:54,9,ricardo84@norton.com,2020-01-15 18:18:34,2020-03-24 21:07:23,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\customers\customer_details


Unnamed: 0,dms_timestamp,id,customer_fk,birthday,language,street_address,postal_code,city,country,phone_number,created,modified,src_batch_id
0,2021-08-05 15:19:54,1,1,1963-12-23,ff,817 Robbins Parkway Suite 056,46333,North Calvin,GW,001-250-991-3804x31652,1970-01-15 10:00:00,1970-01-15 10:00:00,
1,2021-08-05 15:19:54,2,1,1976-09-19,nso,433 Brooks Island Suite 741,43049,Michaelhaven,EE,464-893-6394x1626,2020-01-15 13:37:00,2020-01-15 13:37:00,
2,2021-08-05 15:19:54,3,2,1970-03-30,hsb,87669 Corey Cliffs Apt. 675,91948,Stevensville,DM,+1-475-923-1883x1368,2020-01-15 13:50:08,2020-01-15 13:50:08,
3,2021-08-05 15:19:54,4,3,1975-01-07,gez,8766 Kenneth Route Apt. 484,86596,Port Tamara,UY,790-317-9466x699,2020-01-15 14:11:15,2020-01-15 14:11:15,
4,2021-08-05 15:19:54,5,4,2010-07-05,ps,0545 Morrow Valleys,17678,Hicksville,GD,374-161-5610x0808,2020-01-15 15:11:12,2020-01-15 15:11:12,
5,2021-08-05 15:19:54,6,5,2007-02-03,am,398 William Divide Suite 196,77377,East Kari,TT,821-855-0428x92087,2020-01-15 15:36:15,2020-01-15 15:36:15,
6,2021-08-05 15:19:54,7,6,1932-12-26,lt,2729 Shawna Tunnel,87032,East Jennifershire,KN,710.352.7405,2020-01-15 16:18:10,2020-01-15 16:18:10,
7,2021-08-05 15:19:54,8,7,2014-05-03,mt,4647 Shannon Mountain,66937,Riddlefurt,IL,031-818-5508x0042,2020-01-15 16:42:02,2020-01-15 16:42:02,
8,2021-08-05 15:19:54,9,8,1988-04-24,iu,593 Oneill Loop Apt. 187,73969,Port Amberport,BY,001-348-098-6291x67428,2020-01-15 17:20:06,2020-01-15 17:20:06,
9,2021-08-05 15:19:54,10,9,2018-09-20,lg,911 Crawford Expressway,62185,New Kimfort,CZ,(989)386-1440,2020-01-15 18:18:34,2020-01-15 18:18:34,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- customer_fk: long (nullable = true)
 |-- birthday: date (nullable = true)
 |-- language: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\devices\device_models


Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-08-05 15:19:54,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-08-05 15:19:54,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-08-05 15:19:54,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-08-05 15:19:54,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- release_date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)


[INFO] LOADING:  S3\bronze\abc\devices\devices


Unnamed: 0,dms_timestamp,id,customer_fk,model_fk,serial_number,created,modified,src_batch_id
0,2021-08-05 15:19:54,1,1,1,862-86-8047,1970-01-15 10:18:54,1970-01-15 10:52:11,
1,2021-08-05 15:19:54,2,1,1,329-08-2350,1970-01-15 10:18:54,1970-01-15 10:50:41,
2,2021-08-05 15:19:54,3,1,3,360-73-1379,2020-01-15 14:04:02,2020-01-15 14:47:19,
3,2021-08-05 15:19:54,4,1,3,034-94-0243,2020-01-15 14:04:02,2020-01-15 14:57:24,
4,2021-08-05 15:19:54,5,2,1,688-21-1124,2020-01-15 14:10:11,2020-01-15 14:45:38,
5,2021-08-05 15:19:54,6,2,2,531-52-1018,2020-01-15 14:10:11,2020-01-15 14:59:34,
6,2021-08-05 15:19:54,7,3,4,225-91-3334,2020-01-15 14:14:20,2020-01-15 14:59:47,
7,2021-08-05 15:19:54,8,4,2,339-08-2633,2020-01-15 16:06:51,2020-01-15 16:06:53,
8,2021-08-05 15:19:54,9,5,3,839-26-4038,2020-01-15 16:04:49,2020-01-15 16:36:36,
9,2021-08-05 15:19:54,10,5,1,624-20-4847,2020-01-15 16:04:49,2020-01-15 16:11:13,


root
 |-- dms_timestamp: string (nullable = true)
 |-- id: long (nullable = true)
 |-- customer_fk: long (nullable = true)
 |-- model_fk: long (nullable = true)
 |-- serial_number: string (nullable = true)
 |-- created: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- src_batch_id: integer (nullable = true)



## Access data using SQL

Above, we registered each loaded DataFrame as a temporary view. These views can be queried using Spark SQL.

In [11]:
# Temp view to preview. The load loop registers views as "<db>_<table>", so
# build the name from the first configured pair instead of taking a positional
# row from SHOW TABLES, whose contents and order depend on whatever else is
# registered in the session.
tts = "{}_{}".format(*tables[0])

spark.sql(f"SELECT * FROM {tts}").show()

+-------------------+---+--------------------+-------------------+-------------------+------------+
|      dms_timestamp| id|            username|            created|           modified|src_batch_id|
+-------------------+---+--------------------+-------------------+-------------------+------------+
|2021-08-05 15:19:54|  1|janisourander@kam...|1970-01-15 10:00:00|1970-02-20 12:34:56|        null|
|2021-08-05 15:19:54|  1|gabrielmills@gmai...|2020-01-15 13:37:00|2021-01-07 09:15:05|        null|
|2021-08-05 15:19:54|  2| joseweber@gmail.com|2020-01-15 13:50:08|2020-05-15 17:43:29|        null|
|2021-08-05 15:19:54|  3|    kjensen@wade.com|2020-01-15 14:11:15|2020-02-17 17:12:01|        null|
|2021-08-05 15:19:54|  4|alexandrawebb@gma...|2020-01-15 15:11:12|2020-07-02 01:06:48|        null|
|2021-08-05 15:19:54|  5|juliecrawford@del...|2020-01-15 15:36:15|2020-09-12 05:02:53|        null|
|2021-08-05 15:19:54|  6|thompsonrachel@mo...|2020-01-15 16:18:10|2020-09-19 08:25:01|        null|


## Access data using Delta

In [7]:
# Table to open
tto = tables_saved[0]

# Fail here with a clear message instead of silently leaving `dt` undefined
# (or stale from an earlier run) for the cells below.
if not DeltaTable.isDeltaTable(spark, tto):
    raise ValueError(f"Not a Delta table: {tto}")

dt = DeltaTable.forPath(spark, tto)

In [8]:
# Commit history of the Delta table, converted to pandas for rich display
history_sdf = dt.history()
history_sdf.toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
0,0,2021-08-05 12:22:55.685,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,,False,"{'numOutputRows': '10', 'numOutputBytes': '222...",


In [9]:
# Current contents of the Delta table as a Spark DataFrame, printed as text
current_sdf = dt.toDF()
current_sdf.show()

+-------------------+---+--------------------+-------------------+-------------------+------------+
|      dms_timestamp| id|            username|            created|           modified|src_batch_id|
+-------------------+---+--------------------+-------------------+-------------------+------------+
|2021-08-05 15:19:54|  1|janisourander@kam...|1970-01-15 10:00:00|1970-02-20 12:34:56|        null|
|2021-08-05 15:19:54|  1|gabrielmills@gmai...|2020-01-15 13:37:00|2021-01-07 09:15:05|        null|
|2021-08-05 15:19:54|  2| joseweber@gmail.com|2020-01-15 13:50:08|2020-05-15 17:43:29|        null|
|2021-08-05 15:19:54|  3|    kjensen@wade.com|2020-01-15 14:11:15|2020-02-17 17:12:01|        null|
|2021-08-05 15:19:54|  4|alexandrawebb@gma...|2020-01-15 15:11:12|2020-07-02 01:06:48|        null|
|2021-08-05 15:19:54|  5|juliecrawford@del...|2020-01-15 15:36:15|2020-09-12 05:02:53|        null|
|2021-08-05 15:19:54|  6|thompsonrachel@mo...|2020-01-15 16:18:10|2020-09-19 08:25:01|        null|
