In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

## Configure Spark Session

In [2]:
def _build_spark_session():
    """Create (or reuse) the Spark session for this notebook.

    The options are applied in the listed order on top of the application
    name "GenerateDataset": the Delta Lake package, its SQL extension and
    catalog implementation, and a UTC session time zone.
    """
    session_options = [
        # Delta Lake artifact to put on the classpath.
        ("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0"),
        # Delta's SQL extension and catalog implementation.
        ('spark.sql.extensions', "io.delta.sql.DeltaSparkSessionExtension"),
        ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
        # Session time zone used when rendering/interpreting timestamps.
        ('spark.sql.session.timeZone', 'UTC'),
    ]

    builder = SparkSession.builder.appName("GenerateDataset")
    for option, setting in session_options:
        builder = builder.config(option, setting)
    return builder.getOrCreate()


spark = _build_spark_session()

In [3]:
# Dates/timestamps before 1582-10-15 differ between the legacy hybrid
# (Julian + Gregorian) calendar and the proleptic Gregorian calendar.
# "CORRECTED" tells Spark not to rebase such values and to use them as stored.
for rebase_option in (
    "spark.sql.legacy.parquet.datetimeRebaseModeInRead",
    "spark.sql.legacy.parquet.int96RebaseModeInWrite",
):
    spark.conf.set(rebase_option, "CORRECTED")

## Load Dataset from Staging

In [4]:
# Location of the staged extract: <container>/<layer>/<system>/<database>/<table>.
src_container = 'S3'
src_prefix = 'staging'
src_system = 'alpha'
src_db = 'customers'
src_table = 'customers'

src_dir = os.path.join(src_container, src_prefix, src_system, src_db, src_table)

# os.listdir() returns every directory entry in arbitrary order, so keep only
# the Parquet data files (skipping marker/checksum side files) and sort them
# to make the list of inputs deterministic across runs and platforms.
files_to_load = sorted(
    os.path.join(src_dir, entry)
    for entry in os.listdir(src_dir)
    if entry.lower().endswith('.parquet')
)

# Fail early with a readable message instead of handing Spark an empty list.
if not files_to_load:
    raise FileNotFoundError("No .parquet files found in {!r}".format(src_dir))

df = spark.read.format("parquet").load(files_to_load)

In [5]:
# Preview the staged rows (default 20) with full, untruncated column values.
df.show(n=20, truncate=False)

+---+------------------------+--------------------------------+-------------------+-------------------+
|id |username                |password                        |created            |modified           |
+---+------------------------+--------------------------------+-------------------+-------------------+
|0  |janisourander@kamk.fi   |857bd3cde79f7c0ee5ad7bcc79e1b628|1290-01-15 00:00:00|1970-02-20 12:34:56|
|1  |wendycampbell@barry.com |69a36fde6d009988e284c3029ee8e9f2|2020-01-15 13:37:00|2020-10-04 11:23:54|
|2  |darren62@hotmail.com    |9184e8b7bed887474357d3415c29f9c1|2020-01-15 14:21:57|2020-06-22 16:11:10|
|3  |valerieharper@huber.com |48ca30ac84fb3b395b81f698042ded4d|2020-01-15 14:54:02|2020-05-28 01:50:43|
|4  |darrell22@yahoo.com     |d4ef8f2356efc9bfde24d64a29c87be1|2020-01-15 15:10:26|2020-04-29 21:58:04|
|5  |laurenedwards@smith.com |85bc53f5ee7acb7cd5b8a012257700cd|2020-01-15 15:17:31|2020-02-02 00:20:01|
|6  |sfernandez@quinn.com    |8180c25a9ff5f2c9923eab03be2b8b2d|2

## Write Dataset to Bronze as Delta

In [6]:
# The bronze target mirrors the staging layout; only the layer prefix differs.
tgt_prefix = 'bronze'
tgt_dir = os.path.join(*(src_container, tgt_prefix, src_system, src_db, src_table))

# Note: the target directory is intentionally not created here.

In [7]:
# Echo the resolved target path as the cell output (sanity check before writing).
tgt_dir

'S3\\bronze\\alpha\\customers\\customers'

In [8]:
# Persist the staged rows as a Delta table at tgt_dir, replacing whatever is
# already there. The frame is repartitioned to a single partition first.
(
    df.repartition(1)
    .write
    .format("delta")
    .mode("overwrite")
    .save(tgt_dir)
)

In [9]:
# Read the freshly written Delta table back to verify the round trip.
df2 = spark.read.load(tgt_dir, format="delta")

In [10]:
# Preview the bronze table (default 20 rows, long values truncated).
df2.show(n=20, truncate=True)

+---+--------------------+--------------------+-------------------+-------------------+
| id|            username|            password|            created|           modified|
+---+--------------------+--------------------+-------------------+-------------------+
|  0|janisourander@kam...|857bd3cde79f7c0ee...|1290-01-15 00:00:00|1970-02-20 12:34:56|
|  1|wendycampbell@bar...|69a36fde6d009988e...|2020-01-15 13:37:00|2020-10-04 11:23:54|
|  2|darren62@hotmail.com|9184e8b7bed887474...|2020-01-15 14:21:57|2020-06-22 16:11:10|
|  3|valerieharper@hub...|48ca30ac84fb3b395...|2020-01-15 14:54:02|2020-05-28 01:50:43|
|  4| darrell22@yahoo.com|d4ef8f2356efc9bfd...|2020-01-15 15:10:26|2020-04-29 21:58:04|
|  5|laurenedwards@smi...|85bc53f5ee7acb7cd...|2020-01-15 15:17:31|2020-02-02 00:20:01|
|  6|sfernandez@quinn.com|8180c25a9ff5f2c99...|2020-01-15 15:30:01|2020-11-07 18:46:21|
|  7|hannamichael@hotm...|ab7d94d6a3d2145f0...|2020-01-15 16:17:50|2020-11-28 09:15:21|
|  8|danielleshea@hotm...|9e91dc