# Merge the CDC Files to Bronze

Note that in Databricks, we could use the Auto Loader. In this case, our **readStream** would have additional options:

```python
 .format("cloudFiles")
 .option("cloudFiles.format", "parquet")
 .option("cloudFiles.useNotifications", "true") # Use for SQS/SNS
 .option("cloudFiles.region", "eu-west-1")      # Use for SQS/SNS
```

My thesis includes examples of this in Finnish. This script uses file listing method.

In [1]:
import glob
import os
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession

In [2]:
spark = (SparkSession.builder
         .appName("MergeCDCtoBronze")
         .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0")
         .config('spark.sql.extensions', "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .config('spark.sql.session.timeZone', 'UTC')
         .getOrCreate())


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Imagine orchestator here

If this was a worker Notebook in Databricks, or a worker Python script orchestrated by Airflow the parameters below would be fed while executing this script.

In [3]:
# Params
db, table = 'devices', 'device_models'
all_pks = ["id"]


# Init
pm = PathMerger(db, table)

In [4]:
print("[INFO] The following Parquet files exist in this staging path: ")

for f in glob.glob(pm.staging + os.sep + "**/*.parquet", recursive=True):
    print(f)

[INFO] The following Parquet files exist in this staging path: 
S3\staging\dms\abc\devices\device_models\LOAD00000001.parquet
S3\staging\dms\abc\devices\device_models\2021\8\19\20210819_104713.parquet


## Load

Based on my testing, the pathGlobFilter applies to the filename, not to the whole path. 

Thus, a glob filter such as...
* `**/*.parquet` returns no files
* `[L]*.parquet` returns all files starting with an `L` letter and ending to `.parquet`.
* `[!L]*.parquet` returns all files NOT starting with an `L` letter.

## Define Functions

In [5]:
# Not to use in production.
df_latest = None

In [6]:
def merge_to_delta(batch_df, batch_id):
    """
    TODO: Write what this does.
    """
    
    # Add op_numeral
    batch_df = (
        batch_df
         .withColumn("op_numeral", F.when(F.col("Op") == "I", 1)
                                     .when(F.col("Op") == "U", 2)
                                     .when(F.col("Op") == "D", 3).cast("int")) # Temporary column
        .withColumn('dms_temp', F.to_timestamp(F.col("dms_timestamp"))) # Temporary column
        # .withColumn("par", F.col("*all_pks[0]) % n_pars)
        .withColumn("src_file", F.input_file_name())
        .withColumn("src_batch_id", F.lit(batch_id))
    )
    
    # Compact to one item per id
    latest_uniques = (
        batch_df
            .selectExpr(*all_pks, "struct(dms_temp, op_numeral, *) as others")
            .groupBy(*all_pks)
            .agg(F.max("others").alias("latest"))
            .select("latest.*")
        )
    
    # Not to use in production
    global df_latest
    df_latest = latest_uniques
    
    # Load Delta Table
    target = DeltaTable.forPath(spark, pm.bronze)
    
    # Using target schema, format to: { "id": "s.id" }
    col_map = {x.name: f"s.{x.name}" for x in target.toDF().schema}
    
    # Format the list of primary keys 
    # into SQL join condition like "t.id = s.id AND t.foo = s.foo"
    join_cond = " AND ".join([f"t.{pk} = s.{pk}" for pk in all_pks])
    
    print(col_map)
    print("Joining with condition: ", join_cond)
    
    (
      target.alias("t")
      .merge(
          latest_uniques.alias("s"),
            f"{join_cond}")
            .whenMatchedDelete(condition = "s.Op = 'D'")
            .whenMatchedUpdate(condition = "s.Op = 'U'", set = col_map)
            .whenNotMatchedInsert(condition = "s.Op != 'D'", values = col_map)
      .execute()
    )    

## Stream

In [11]:
    
# Schema is forced to match the Bronze, but with an extra field. DMS does not add Op to full load.
readers_schema = spark.read.format("delta").load(pm.bronze).schema.add("Op", "string")

# Checkpoints will be written to...
checkpoint_path = os.path.join('S3', 'bronze', '_checkpoints', 'abc', db, table)


# Prepare Spark Auto Loader
df = ( spark.readStream
        .format("parquet")
        .option("recursiveFileLookup", "true")
        .option("pathGlobFilter", "[!L][!O][!A][!D]*.parquet")
        .schema(readers_schema)
        .load(pm.staging)
  )


# Stream
streamingquery = ( 
    df
    .writeStream
    .trigger(once=True)
    .foreachBatch(merge_to_delta)
    # .option("checkpointLocation", checkpoint_path)
    .start()
)
 
streamingquery.awaitTermination()

{'dms_timestamp': 's.dms_timestamp', 'id': 's.id', 'release_date': 's.release_date', 'name': 's.name', 'color': 's.color', 'description': 's.description', 'created': 's.created', 'modified': 's.modified', 'src_batch_id': 's.src_batch_id'}
Joining with condition:  t.id = s.id


StreamingQueryException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "C:\Users\soura\PycharmProjects\opinnaytetyo\venv\lib\site-packages\py4j\java_gateway.py", line 2451, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "C:\Users\soura\SPARK\spark-3.1.2-bin-hadoop2.7\python\pyspark\sql\utils.py", line 196, in call
    raise e
  File "C:\Users\soura\SPARK\spark-3.1.2-bin-hadoop2.7\python\pyspark\sql\utils.py", line 193, in call
    self.func(DataFrame(jdf, self.sql_ctx), batch_id)
  File "C:\Users\soura\AppData\Local\Temp/ipykernel_6544/683105810.py", line 45, in merge_to_delta
    target.alias("t")
  File "C:\Users\soura\AppData\Local\Temp\spark-843ea45a-c4be-48b0-ba79-60abc3fbc2af\userFiles-0d1b016a-1c75-4bf8-97ec-27f60695023f\io.delta_delta-core_2.12-0.8.0.jar\delta\tables.py", line 627, in execute
    self._jbuilder.execute()
  File "C:\Users\soura\PycharmProjects\opinnaytetyo\venv\lib\site-packages\py4j\java_gateway.py", line 1309, in __call__
    return_value = get_return_value(
  File "C:\Users\soura\SPARK\spark-3.1.2-bin-hadoop2.7\python\pyspark\sql\utils.py", line 111, in deco
    return f(*a, **kw)
  File "C:\Users\soura\PycharmProjects\opinnaytetyo\venv\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o279.execute.
: java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.Alias.<init>(Lorg/apache/spark/sql/catalyst/expressions/Expression;Ljava/lang/String;Lorg/apache/spark/sql/catalyst/expressions/ExprId;Lscala/collection/Seq;Lscala/Option;)V
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$buildTargetPlanWithFiles$1(MergeIntoCommand.scala:577)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.immutable.List.map(List.scala:298)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.buildTargetPlanWithFiles(MergeIntoCommand.scala:569)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$findTouchedFiles$1(MergeIntoCommand.scala:330)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:622)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.findTouchedFiles(MergeIntoCommand.scala:306)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2(MergeIntoCommand.scala:267)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2$adapted(MergeIntoCommand.scala:250)
	at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:188)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$1(MergeIntoCommand.scala:250)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:622)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.run(MergeIntoCommand.scala:249)
	at io.delta.tables.DeltaMergeBuilder.$anonfun$execute$1(DeltaMergeBuilder.scala:239)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError(AnalysisHelper.scala:60)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError$(AnalysisHelper.scala:48)
	at io.delta.tables.DeltaMergeBuilder.improveUnsupportedOpError(DeltaMergeBuilder.scala:123)
	at io.delta.tables.DeltaMergeBuilder.execute(DeltaMergeBuilder.scala:228)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


=== Streaming Query ===
Identifier: [id = b74e9320-4b4b-491e-922e-93722068a2ce, runId = 4d9a246f-0833-4aea-9d6b-6dff364a6250]
Current Committed Offsets: {}
Current Available Offsets: {FileStreamSource[file:/C:/Users/soura/PycharmProjects/opinnaytetyo/S3/staging/dms/abc/devices/device_models]: {"logOffset":0}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
FileStreamSource[file:/C:/Users/soura/PycharmProjects/opinnaytetyo/S3/staging/dms/abc/devices/device_models]

## Examine before-after compaction

In [12]:
df_latest.show(truncate=False)

+-------------------+----------+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+---+----------+-------------------+------------------------------------------------------------------------------------------------------------------------------+
|dms_temp           |op_numeral|dms_timestamp      |id |release_date|name            |color|description|created            |modified           |src_batch_id|Op |op_numeral|dms_temp           |src_file                                                                                                                      |
+-------------------+----------+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+------------+---+----------+-------------------+------------------------------------------------------------------------------------------------------------------------------+
|2021-08-19 10:46:24|1         |2021-08-

In [13]:
# Load Delta Table
target = DeltaTable.forPath(spark, pm.bronze)

# Using target schema, format to: { "id": "s.id" }
col_map = {x.name: f"s.{x.name}" for x in target.toDF().schema}

(
  target.alias("t")
  .merge(
      df_latest.alias("s"),
        "t.id = s.id")
        .whenMatchedDelete(condition = "s.Op = 'D'")
        .whenMatchedUpdate(condition = "s.Op = 'U'", set = col_map)
        .whenNotMatchedInsert(condition = "s.Op != 'D'", values = col_map)
  .execute()
)  

Py4JJavaError: An error occurred while calling o321.execute.
: java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.Alias.<init>(Lorg/apache/spark/sql/catalyst/expressions/Expression;Ljava/lang/String;Lorg/apache/spark/sql/catalyst/expressions/ExprId;Lscala/collection/Seq;Lscala/Option;)V
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$buildTargetPlanWithFiles$1(MergeIntoCommand.scala:577)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.immutable.List.map(List.scala:298)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.buildTargetPlanWithFiles(MergeIntoCommand.scala:569)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$findTouchedFiles$1(MergeIntoCommand.scala:330)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:622)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.findTouchedFiles(MergeIntoCommand.scala:306)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2(MergeIntoCommand.scala:267)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2$adapted(MergeIntoCommand.scala:250)
	at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:188)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$1(MergeIntoCommand.scala:250)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:622)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.run(MergeIntoCommand.scala:249)
	at io.delta.tables.DeltaMergeBuilder.$anonfun$execute$1(DeltaMergeBuilder.scala:239)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError(AnalysisHelper.scala:60)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError$(AnalysisHelper.scala:48)
	at io.delta.tables.DeltaMergeBuilder.improveUnsupportedOpError(DeltaMergeBuilder.scala:123)
	at io.delta.tables.DeltaMergeBuilder.execute(DeltaMergeBuilder.scala:228)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [10]:
target.history().show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|      6|2021-08-22 07:57:...|  null|    null|    WRITE|{mode -> Overwrit...|null|    null|     null|          5|          null|        false|{numFiles -> 1, n...|        null|
|      5|2021-08-22 07:49:...|  null|    null|    WRITE|{mode -> Overwrit...|null|    null|     null|          4|          null|        false|{numFiles -> 1, n...|        null|
|      4|2021-08-22 07:47:...|  null|    null|    WRITE|{mode -> Overwrit...|null|    null|     null|          3|  

## TODO: Examine query progress

In [73]:
streamingquery.lastProgress

In [75]:
streamingquery.status

{'message': 'Terminated with exception: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):\n  File "c:\\users\\soura\\pycharmprojects\\opinnaytetyo\\venv\\lib\\site-packages\\py4j\\java_gateway.py", line 2442, in _call_proxy\n    return_value = getattr(self.pool[obj_id], method)(*params)\n  File "C:\\Users\\soura\\SPARK\\spark-3.1.2-bin-hadoop2.7\\python\\pyspark\\sql\\utils.py", line 196, in call\n    raise e\n  File "C:\\Users\\soura\\SPARK\\spark-3.1.2-bin-hadoop2.7\\python\\pyspark\\sql\\utils.py", line 193, in call\n    self.func(DataFrame(jdf, self.sql_ctx), batch_id)\n  File "<ipython-input-68-4c2a0329a79b>", line 51, in merge_to_delta\n    f"{join_condidion}")\nNameError: name \'join_condidion\' is not defined\n',
 'isDataAvailable': False,
 'isTriggerActive': False}