In [39]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Context

In [2]:
spark = (SparkSession.builder
         .appName("GenerateDataset")
         .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0")
         .config('spark.sql.extensions', "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .config('spark.sql.session.timeZone', 'UTC')
         .getOrCreate())

In [3]:
# Correct all dates before 1582-10-15 to prolectic gregorian calendar
spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
# spark.conf.set("spark.sql.session.timeZone", "UTC")

## Handle Names

In [65]:
class PathMerger:
    """Class for generating staging and bronze paths."""
    
    # Settings that should be loaded from a settings file
    container = 'S3'
    src_prefix = 'staging' 
    brz_prefix = 'bronze'
    tool = 'dms'
    ss = 'company_rds'
    
    # Paths. These will be populated during class init.
    staging = None
    bronze = None
    
    def __init__(self, db, table):
        
        # Fetch input
        self.db = db
        self.table = table
        
        # Populate path variables
        self.generate_paths()
    
    def generate_paths(self):
        self.staging = os.path.join(self.container, self.src_prefix, self.tool, self.ss, self.db, self.table)
        self.bronze = os.path.join(self.container, self.brz_prefix, self.ss, self.db, self.table)

## Load Dataset from Staging

In [66]:
customer = PathMerger('customers', 'customers')

df = spark.read.format("parquet").load(customer.staging)

In [67]:
df.show(truncate=False)

+---+----------------------------+--------------------------------+-------------------+-------------------+
|id |username                    |password                        |created            |modified           |
+---+----------------------------+--------------------------------+-------------------+-------------------+
|0  |janisourander@kamk.fi       |135f027721f9840003efe860b37328e0|1290-01-15 00:00:00|1970-02-20 12:34:56|
|1  |colleen45@yahoo.com         |989e05bb848ad26f37d5d8cf53c60f8f|2020-01-15 13:37:00|2020-06-06 21:33:05|
|2  |emily75@hotmail.com         |f1d5f11cadcb3bee236690e25bdbe362|2020-01-15 14:12:43|2020-09-22 17:37:41|
|3  |chrisfarrell@hotmail.com    |e587e7cb20a566724a0541225b219016|2020-01-15 14:38:55|2020-08-03 11:51:18|
|4  |swells@andrews-rowe.net     |e66fcd21555908d4a4940e6462bba101|2020-01-15 14:42:01|2020-07-30 23:44:39|
|5  |npearson@mathews.com        |d67e01f1df84ca76d18335fdbf55fd10|2020-01-15 15:17:26|2020-08-30 20:21:12|
|6  |grace05@hotmail.com    

## TODO: Write Dataset to Bronze as Delta

In [16]:
# Bronze has similar file naming as the staging, but the tool is unnecessary.
tgt_layer = 'bronze'

tgt_dir = os.path.join(src_container, tgt_layer, src_system, src_db, src_table)

# Create directory to the Delta Table
#if not os.path.exists(tgt_dir):
#    os.makedirs(tgt_dir)

In [17]:
tgt_dir

'S3\\bronze\\company_rds\\customers\\customers'

In [18]:
df.repartition(1).write.mode("overwrite").format("delta").save(tgt_dir)

In [19]:
df2 = spark.read.format("delta").load(tgt_dir)

In [20]:
df2.show()

+---+--------------------+--------------------+-------------------+-------------------+
| id|            username|            password|            created|           modified|
+---+--------------------+--------------------+-------------------+-------------------+
|  0|janisourander@kam...|135f027721f984000...|1290-01-15 00:00:00|1970-02-20 12:34:56|
|  1| colleen45@yahoo.com|989e05bb848ad26f3...|2020-01-15 13:37:00|2020-06-06 21:33:05|
|  2| emily75@hotmail.com|f1d5f11cadcb3bee2...|2020-01-15 14:12:43|2020-09-22 17:37:41|
|  3|chrisfarrell@hotm...|e587e7cb20a566724...|2020-01-15 14:38:55|2020-08-03 11:51:18|
|  4|swells@andrews-ro...|e66fcd21555908d4a...|2020-01-15 14:42:01|2020-07-30 23:44:39|
|  5|npearson@mathews.com|d67e01f1df84ca76d...|2020-01-15 15:17:26|2020-08-30 20:21:12|
|  6| grace05@hotmail.com|6f5915f11d91c2c3a...|2020-01-15 15:35:37|2020-12-20 01:37:41|
|  7|cervantesjonathan...|901ae44b77ce9a6ef...|2020-01-15 16:11:25|2020-01-28 02:38:20|
|  8| longjason@yahoo.com|6b51e4