# Merge the CDC Files to Bronze

Note that in Databricks, we could use the Auto Loader. In this case, our **readStream** would have additional options:

```python
 .format("cloudFiles")
 .option("cloudFiles.format", "parquet")
 .option("cloudFiles.useNotifications", "true") # Use for SQS/SNS
 .option("cloudFiles.region", "eu-west-1")      # Use for SQS/SNS
```

My thesis includes examples of this in Finnish. This script uses the file-listing method instead.

In [69]:
import glob
import os
import pprint
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session. The Delta Lake package, SQL extension
# and catalog are enabled, and the session time zone is pinned to UTC.
_session_builder = SparkSession.builder.appName("MergeCDCtoBronze")

for _conf_key, _conf_value in (
    ("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.sql.session.timeZone", "UTC"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.getOrCreate()


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Imagine an orchestrator here

If this were a worker notebook in Databricks, or a worker Python script orchestrated by Airflow, the parameters below would be passed in when the script is executed.

In [3]:
# Parameters -- in an orchestrated run these would be supplied by the caller.
db = "devices"
table = "device_models"
all_pks = ["id"]  # primary-key column(s); used for grouping and for the merge join

# Path helper for this database/table pair (exposes staging/bronze/hive).
pm = PathMerger(db, table)

In [5]:
# List every Parquet file found (recursively) under the staging path.
print("[INFO] The following Parquet files exist in this staging path: ")

staging_pattern = os.sep.join([pm.staging, "**/*.parquet"])
for parquet_file in glob.glob(staging_pattern, recursive=True):
    print(parquet_file)

[INFO] The following Parquet files exist in this staging path: 
S3\staging\dms\abc\devices\device_models\LOAD00000001.parquet
S3\staging\dms\abc\devices\device_models\2021\9\5\20210905_095611.parquet


## Load

Based on my testing, the pathGlobFilter applies to the filename, not to the whole path. 

Thus, a glob filter such as...
* `**/*.parquet` returns no files
* `[L]*.parquet` returns all files starting with the letter `L` and ending in `.parquet`.
* `[!L]*.parquet` returns all files NOT starting with an `L` letter.

## Define Functions

In [134]:
def with_ordering_cols(input_df, batch_id):
    """Append the helper columns used to order and trace CDC rows.

    Three columns are added to ``input_df``, in this order:

    * ``op_numeral`` -- integer rank of the ``Op`` column
      (``I`` -> 1, ``U`` -> 2, ``D`` -> 3; any other value gives null).
    * ``dms_temp`` -- ``dms_timestamp`` passed through ``to_timestamp``.
    * ``src_batch_id`` -- ``batch_id`` as an integer literal.

    :param input_df: DataFrame with ``Op`` and ``dms_timestamp`` columns.
    :param batch_id: identifier of the micro-batch the rows belong to.
    :return: ``input_df`` with the three columns appended.
    """
    op = F.col("Op")
    op_rank = F.when(op == "I", 1).when(op == "U", 2).when(op == "D", 3)

    ranked_df = input_df.withColumn("op_numeral", op_rank.cast("int"))
    stamped_df = ranked_df.withColumn(
        "dms_temp", F.to_timestamp(F.col("dms_timestamp"))
    )
    # A partition column ("par") and a source-file column
    # (F.input_file_name()) were tried at this point and are left disabled.
    return stamped_df.withColumn("src_batch_id", F.lit(batch_id).cast("int"))


def log_compact(input_df, cols_to_drop=("aaa", "bbb")):
    """Collapse a change log to the single latest row per primary key.

    Every row is wrapped in a struct whose first two fields are
    ``dms_temp`` (aliased ``aaa``) and ``op_numeral`` (aliased ``bbb``),
    followed by all original columns. Spark compares structs field by
    field, so ``max`` over that struct picks the row with the greatest
    timestamp and, on ties, the greatest operation rank. The struct is then
    expanded back into columns and the two ordering aliases are dropped.

    The primary-key columns are read from the module-level ``all_pks``.

    :param input_df: DataFrame that already carries ``dms_temp`` and
        ``op_numeral`` (see ``with_ordering_cols``).
    :param cols_to_drop: names of the ordering aliases to remove from the
        result. The default is an immutable tuple rather than a list so the
        default object can never be mutated between calls.
    :return: DataFrame with one row per distinct primary key.
    """
    ordered_struct = "struct(dms_temp as aaa, op_numeral as bbb, *) as others"
    output_df = (
        input_df
        .selectExpr(*all_pks, ordered_struct)
        .groupBy(*all_pks)
        .agg(F.max("others").alias("latest"))
        .select("latest.*")
        .drop(*cols_to_drop)
    )
    return output_df


def merge_to_delta(batch_df, batch_id):
    """Merge one micro-batch of CDC rows into the target Delta table.

    Intended as a ``foreachBatch`` callback. The batch is enriched with
    ordering columns, compacted to one row per primary key, and merged into
    the Delta table registered under ``pm.hive``:

    * matched and ``Op = 'D'``      -> delete the target row
    * matched and ``Op != 'D'``     -> overwrite the target row
    * not matched and ``Op != 'D'`` -> insert the row

    The matched-update clause deliberately covers inserts as well as
    updates. If a key is deleted and re-inserted within one batch, or an
    insert is replayed, the compacted row has ``Op = 'I'`` while the key
    still exists in the target. Restricting the update to ``Op = 'U'``
    would silently keep the stale target row in that case.

    Relies on the module-level ``spark``, ``pm`` and ``all_pks``.

    :param batch_df: micro-batch DataFrame read from the staging files.
    :param batch_id: micro-batch id supplied by Structured Streaming.
    """
    # Add op_numeral, dms_temp and src_batch_id.
    batch_df = with_ordering_cols(batch_df, batch_id)

    # Compact the change log to one item per primary key.
    latest_uniques = log_compact(batch_df)

    # Load the target Delta table.
    target = DeltaTable.forName(spark, pm.hive)

    # Using the target schema, build a mapping such as {"id": "s.id"}; source
    # columns that are absent from the target are thereby ignored.
    col_map = {field.name: f"s.{field.name}" for field in target.toDF().schema}

    # Turn the primary keys into a SQL join condition such as
    # "t.id = s.id AND t.foo = s.foo".
    join_cond = " AND ".join(f"t.{pk} = s.{pk}" for pk in all_pks)

    (
        target.alias("t")
        .merge(latest_uniques.alias("s"), join_cond)
        .whenMatchedDelete(condition="s.Op = 'D'")
        .whenMatchedUpdate(condition="s.Op != 'D'", set=col_map)
        .whenNotMatchedInsert(condition="s.Op != 'D'", values=col_map)
        .execute()
    )

## Restore to VERSION 0

To make sure that this Notebook is idempotent, let's always start from version 0.

Another benefit is that we will have a Hive table available. The Hive metastore in a single-node test environment is not persistent, so we need to create a new database and a new (EXTERNAL) table each time we restart the Python kernel and create a new SparkSession.

In production, this would not be needed.

In [131]:
# Make sure the database exists before the table is registered in it.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# Version 0 of the Bronze Delta table, read through time travel.
df_full_load = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(pm.bronze)
)

# Overwrite the table (schema included) with that snapshot and register it
# under pm.hive, pointing at the absolute Bronze path.
bronze_writer = df_full_load.write.format('delta').mode('overwrite')
bronze_writer = bronze_writer.option('overwriteSchema', 'true')
bronze_writer = bronze_writer.option('path', os.path.abspath(pm.bronze))
bronze_writer.saveAsTable(pm.hive)

## Stream

In [135]:
    
# The reader schema is the Bronze schema plus one extra string field, "Op".
# (DMS does not add Op to full-load files, so Bronze itself lacks it.)
bronze_schema = spark.read.format("delta").load(pm.bronze).schema
readers_schema = bronze_schema.add("Op", "string")

# Location reserved for streaming checkpoints.
# NOTE(review): the checkpointLocation option below is disabled, so this
# path is currently unused -- presumably so that every run re-reads all
# staged files; confirm before enabling.
checkpoint_path = os.path.join('S3', 'bronze', '_checkpoints', 'abc', db, table)

# Plain file-listing Parquet stream (not the Databricks Auto Loader).
# The glob filter is matched against file names only.
stream_reader = spark.readStream.format("parquet")
stream_reader = stream_reader.option("recursiveFileLookup", "true")
stream_reader = stream_reader.option("pathGlobFilter", "[!L][!O][!A][!D]*.parquet")
df = stream_reader.schema(readers_schema).load(pm.staging)

# Run a single trigger and hand each micro-batch to merge_to_delta.
stream_writer = df.writeStream.trigger(once=True).foreachBatch(merge_to_delta)
# stream_writer = stream_writer.option("checkpointLocation", os.path.abspath(checkpoint_path))
streamingquery = stream_writer.start()

# Block until the one-off trigger has finished.
streamingquery.awaitTermination()

In [136]:
# Pretty-printer used for the progress report.
pp = pprint.PrettyPrinter()

# Report what the most recent micro-batch did.
last_progress = streamingquery.lastProgress
pp.pprint(last_progress)

# Keep the batch id so the compacted DataFrame can be re-created later on.
bid = last_progress['batchId']

{'batchId': 0,
 'durationMs': {'addBatch': 3070,
                'getBatch': 4,
                'latestOffset': 143,
                'queryPlanning': 7,
                'triggerExecution': 3510,
                'walCommit': 141},
 'id': '991ad2c1-b433-4b46-9549-c2f1a780beb7',
 'inputRowsPerSecond': 0.0,
 'name': None,
 'numInputRows': 8,
 'processedRowsPerSecond': 2.2792022792022792,
 'runId': 'e14d412d-9de3-4fae-a148-69682682625b',
 'sink': {'description': 'ForeachBatchSink', 'numOutputRows': -1},
 'sources': [{'description': 'FileStreamSource[file:/C:/Users/soura/PycharmProjects/opinnaytetyo/S3/staging/dms/abc/devices/device_models]',
              'endOffset': {'logOffset': 0},
              'inputRowsPerSecond': 0.0,
              'numInputRows': 8,
              'processedRowsPerSecond': 2.2792022792022792,
              'startOffset': None}],
 'stateOperators': [],
 'timestamp': '2021-09-05T08:36:09.450Z'}


# Examine before-after compaction

### Bronze before Merge

In [137]:
# Show the VERSION 0 - The original FULL LOAD.
# df_full_load was read with versionAsOf=0, so it reflects the table before
# any merge. Being the cell's last expression, the pandas frame is rendered
# as the cell output.
df_full_load.toPandas()

Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-09-04 11:10:50,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-09-04 11:10:50,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-09-04 11:10:50,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-09-04 11:10:50,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


### CDC Before Log Compaction

In [138]:
# Batch-read the staged change files, using the same file-name filter as the
# streaming reader so the full-load file is left out.
df_cdc = (
    spark.read
    .option("recursiveFileLookup", "true")
    .option("pathGlobFilter", "[!L][!O][!A][!D]*.parquet")
    .load(pm.staging)
)

# Render the raw change log.
df_cdc.toPandas()

Unnamed: 0,Op,dms_timestamp,id,release_date,name,color,description,created,modified
0,I,2021-09-05 09:55:48,5,2021-08-01,Super Gadget 300,Black,new device,2021-09-05 06:55:48,2021-09-05 06:55:48
1,I,2021-09-05 09:55:48,6,2021-08-01,Super Gadget 300,Pink,new device,2021-09-05 06:55:48,2021-09-05 06:55:48
2,U,2021-09-05 09:55:54,1,2010-05-15,Super Gadget 100,Red,update A,2010-03-21 12:00:01,2021-09-05 06:55:54
3,U,2021-09-05 09:55:54,2,2010-05-15,Super Gadget 100,Black,update B,2010-03-21 12:00:02,2021-09-05 06:55:54


### CDC After Log Compaction

In [148]:
# Step 1: attach the ordering columns, tagged with the streamed batch id.
df_cdc_ordered = with_ordering_cols(df_cdc, bid)

# Step 2: keep only the latest change per primary key.
df_latest_uniques = log_compact(df_cdc_ordered)

# Render the compacted change log.
df_latest_uniques.toPandas()

Unnamed: 0,Op,dms_timestamp,id,release_date,name,color,description,created,modified,op_numeral,dms_temp,src_batch_id
0,I,2021-09-05 09:55:48,6,2021-08-01,Super Gadget 300,Pink,new device,2021-09-05 06:55:48,2021-09-05 06:55:48,1,2021-09-05 09:55:48,0
1,I,2021-09-05 09:55:48,5,2021-08-01,Super Gadget 300,Black,new device,2021-09-05 06:55:48,2021-09-05 06:55:48,1,2021-09-05 09:55:48,0
2,U,2021-09-05 09:55:54,1,2010-05-15,Super Gadget 100,Red,update A,2010-03-21 12:00:01,2021-09-05 06:55:54,2,2021-09-05 09:55:54,0
3,U,2021-09-05 09:55:54,2,2010-05-15,Super Gadget 100,Black,update B,2010-03-21 12:00:02,2021-09-05 06:55:54,2,2021-09-05 09:55:54,0


### Final Result

In [146]:
# Print the current contents of the table registered under pm.hive,
# which is the table merge_to_delta writes to.
spark.table(pm.hive).show()

+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+
|      dms_timestamp| id|release_date|            name|color|description|            created|           modified|src_batch_id|
+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+
|2021-09-05 09:55:48|  5|  2021-08-01|Super Gadget 300|Black| new device|2021-09-05 06:55:48|2021-09-05 06:55:48|           0|
|2021-09-05 09:55:48|  6|  2021-08-01|Super Gadget 300| Pink| new device|2021-09-05 06:55:48|2021-09-05 06:55:48|           0|
|2021-09-05 09:55:54|  2|  2010-05-15|Super Gadget 100|Black|   update B|2010-03-21 12:00:02|2021-09-05 06:55:54|           0|
|2021-09-05 09:55:54|  1|  2010-05-15|Super Gadget 100|  Red|   update A|2010-03-21 12:00:01|2021-09-05 06:55:54|           0|
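|2021-09-04 11:10:50|  4|  2018-05-13|Super Gadget 200|White|lorem ipsum|2018-03-20 12:01:01|2018-03-20 12:01:01|        null|
|2021-09-04 11:10:50|  3|  2010-11-01|Super Gadget 100| Pink|lorem ipsum|2010-08-05 07:00:00|2010-08-05 07:00:00|        null|
+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+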

In [156]:
# Handle to the Delta table registered under pm.hive.
dt = DeltaTable.forName(spark, pm.hive)

# Render the table's commit history as a pandas DataFrame.
dt.history().toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
0,9,2021-09-05 08:36:12.056,,,MERGE,"{'matchedPredicates': '[{""predicate"":""(s.`Op` ...",,,,8.0,,False,"{'numOutputRows': '6', 'numTargetRowsInserted'...",
1,8,2021-09-05 08:33:42.188,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,7.0,,False,"{'numOutputRows': '4', 'numOutputBytes': '2565...",
2,7,2021-09-05 08:11:36.572,,,MERGE,"{'matchedPredicates': '[{""predicate"":""(s.`Op` ...",,,,6.0,,False,"{'numOutputRows': '6', 'numTargetRowsInserted'...",
3,6,2021-09-05 08:09:46.490,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,5.0,,False,"{'numOutputRows': '4', 'numOutputBytes': '2565...",
4,5,2021-09-05 07:58:08.388,,,MERGE,"{'matchedPredicates': '[{""predicate"":""(s.`Op` ...",,,,4.0,,False,"{'numOutputRows': '6', 'numTargetRowsInserted'...",
5,4,2021-09-05 07:57:57.204,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,3.0,,False,"{'numOutputRows': '4', 'numOutputBytes': '2565...",
6,3,2021-09-05 07:55:16.306,,,MERGE,"{'matchedPredicates': '[{""predicate"":""(s.`Op` ...",,,,2.0,,False,"{'numOutputRows': '6', 'numTargetRowsInserted'...",
7,2,2021-09-05 07:49:52.616,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,1.0,,False,"{'numOutputRows': '4', 'numOutputBytes': '2565...",
8,1,2021-09-05 07:37:56.039,,,MERGE,"{'matchedPredicates': '[{""predicate"":""(s.`Op` ...",,,,0.0,,False,"{'numOutputRows': '6', 'numTargetRowsInserted'...",
9,0,2021-09-05 06:17:53.575,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,,,False,"{'numOutputRows': '4', 'numOutputBytes': '2569...",
