In [1]:
import os
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Session

In [2]:
# Build (or reuse) the notebook's Spark session. The settings below pull in the
# Delta Lake package, register its SQL extension and catalog, and pin the
# session time zone to UTC.
session_options = {
    "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.session.timeZone": "UTC",
}

builder = SparkSession.builder.appName("LoadDatasetsToBronze")
for option_name, option_value in session_options.items():
    # Each setting is applied in the order listed above.
    builder = builder.config(option_name, option_value)

spark = builder.getOrCreate()

In [3]:
# Dates before 1582-10-15 differ between the legacy hybrid calendar and the
# proleptic Gregorian calendar. CORRECTED mode is requested for reading Parquet
# datetimes and for writing INT96 timestamps.
# NOTE(review): only the read-side datetime mode and the write-side INT96 mode
# are set here -- confirm that this asymmetry is intentional.
rebase_settings = {
    "spark.sql.legacy.parquet.datetimeRebaseModeInRead": "CORRECTED",
    "spark.sql.legacy.parquet.int96RebaseModeInWrite": "CORRECTED",
}
for conf_key, conf_value in rebase_settings.items():
    spark.conf.set(conf_key, conf_value)
# The session time zone (UTC) is configured on the session builder, so it is
# not set again here.

## Ingestion Settings

These variables are hard-coded here; in production, they should be stored in an ETL settings database.

In [4]:
# Source tables to ingest, listed as (db, table) pairs.
tables = [
    ("customers", "customers"),
    ("customers", "customer_details"),
    ("devices", "device_models"),
    ("devices", "devices"),
]

## Load Datasets from Staging into Bronze

In [33]:
# Bronze locations written by this cell, collected so they can be previewed.
tables_saved = []

# Make sure the target database exists (no-op when it is already there).
spark.sql('CREATE DATABASE IF NOT EXISTS bronze')

for db, table in tables:
    # Path helper for this (db, table) pair; its .staging and .bronze
    # attributes are the locations read from and written to below.
    table_path = PathMerger(db, table)

    # Read the staged Parquet data and append an all-null integer batch id.
    # Optional lineage column, currently disabled:
    #   .withColumn("src_file", F.input_file_name())
    staged = spark.read.format("parquet").load(table_path.staging)
    df = staged.withColumn("src_batch_id", F.lit(None).cast("integer"))

    # Write to BRONZE: repartition to a single partition, then overwrite the
    # Delta table at the bronze location, replacing its schema as well.
    single_partition_writer = df.repartition(1).write.format("delta")
    (
        single_partition_writer
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(table_path.bronze)
    )

    # TODO: Find a way to get Delta Catalog file paths working on WINDOWS OS.
    #
    # spark.sql(f"CREATE OR REPLACE TABLE {table_path.hive} USING DELTA LOCATION '{table_path.bronze}'")

    # Remember where this table was written so it can be previewed later.
    tables_saved.append(table_path.bronze)

## Display what was created

In [38]:
# Preview every Bronze Delta table recorded in `tables_saved`: print a banner,
# then show the rows untruncated, then the schema.
# NOTE(review): the rendered rows can include e-mail addresses and password
# hashes; consider clearing this cell's output before sharing the notebook.
banner_rule = "=" * 72

for table_path in tables_saved:
    print("\n\n[INFO] LOADING: ", table_path)
    print(banner_rule)

    # Re-read from the Delta location so the preview reflects what was stored.
    df = spark.read.format("delta").load(table_path)
    df.show(truncate=False)
    df.printSchema()



[INFO] LOADING:  S3\bronze\company_rds\customers\customers
+---+---------------------------------+--------------------------------+-------------------+-------------------+------------+
|id |username                         |password                        |created            |modified           |src_batch_id|
+---+---------------------------------+--------------------------------+-------------------+-------------------+------------+
|0  |janisourander@kamk.fi            |ae6743f425fd616c01f4a046c4873635|1290-01-15 00:00:00|1970-02-20 12:34:56|null        |
|1  |wrivera@lowe.org                 |3e074ece5f07a59ac540bdf80719dd8f|2020-01-15 13:37:00|2020-11-29 19:14:52|null        |
|2  |jill35@walker-thornton.net       |97c19456e44160c1f44729b13ed80a68|2020-01-15 14:11:50|2020-05-23 15:01:53|null        |
|3  |henrystephanie@ortega-mahoney.com|7abb90c2f709cdad1e380072668c00ae|2020-01-15 14:52:26|2020-04-02 23:49:23|null        |
|4  |zwilliams@hotmail.com            |fe31516750fd66761c