# Merge the CDC Files to Bronze

Note that in Databricks, we could use the Auto Loader. In this case, our **readStream** would have additional options:

```python
 .format("cloudFiles")
 .option("cloudFiles.format", "parquet")
 .option("cloudFiles.useNotifications", "true") # Use for SQS/SNS
 .option("cloudFiles.region", "eu-west-1")      # Use for SQS/SNS
```

My thesis includes examples of this in Finnish. This script uses the file listing method instead.

In [29]:
import glob
import os
import pyspark.sql.functions as F
from helpers.paths import PathMerger
from pyspark.sql import SparkSession

In [2]:
# Session settings: the Delta Lake package, its SQL extension and catalog,
# and UTC as the session time zone.
_session_conf = {
    "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.session.timeZone": "UTC",
}

# Apply the settings in the order listed above, then create (or reuse) the session.
_builder = SparkSession.builder.appName("MergeCDCtoBronze")
for _conf_key, _conf_value in _session_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)

spark = _builder.getOrCreate()


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Imagine an orchestrator here

If this were a worker notebook in Databricks, or a worker Python script orchestrated by Airflow, the parameters below would be passed in when the script is executed.

In [96]:
# Params: source database and table, plus the key columns used to group rows.
db = 'devices'
table = 'device_models'
all_pks = ["id"]


# Init: path helper for this database/table pair.
pm = PathMerger(db, table)

In [95]:
# List every Parquet file below the staging path, at any directory depth.
print("[INFO] The following Parquet files exist in this staging path: ")

parquet_pattern = pm.staging + os.sep + "**/*.parquet"
for parquet_file in glob.iglob(parquet_pattern, recursive=True):
    print(parquet_file)

[INFO] The following Parquet files exist in this staging path: 
S3\staging\dms\abc\devices\device_models\LOAD00000001.parquet
S3\staging\dms\abc\devices\device_models\2021\8\19\20210819_104713.parquet


## Load

Based on my testing, the pathGlobFilter applies to the filename, not to the whole path. 

Thus, a glob filter such as...
* `**/*.parquet` returns no files
* `[L]*.parquet` returns all files starting with an `L` letter and ending with `.parquet`.
* `[!L]*.parquet` returns all files NOT starting with an `L` letter.

In [92]:
# Debugging hooks: module-level slots for inspecting DataFrames outside the
# streaming callback. Only input_df is assigned by merge_to_delta below.
input_df = None
merge_df = None


def merge_to_delta(batch_df, batch_id):
    """
    Reduce one CDC micro-batch to the latest change per primary key.

    Meant to be passed to ``writeStream.foreachBatch``. The batch is enriched
    with helper columns and then collapsed so that only one row per key in
    ``all_pks`` remains: the row with the greatest ``dms_temp`` timestamp,
    with ties broken by ``op_numeral`` (I=1 < U=2 < D=3).

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
            Must contain the columns ``Op``, ``dms_timestamp`` and every
            column named in ``all_pks``.
        batch_id: Identifier of the micro-batch; stored in ``src_batch_id``.

    Returns:
        None.

    TODO: the deduplicated rows (``latest_uniques``) are not yet merged into
    the Bronze Delta table; that step still has to be written.
    """

    # Not to use in production. Steal the DataFrame so that we can investigate it outside this function.
    global input_df
    input_df = batch_df

    # Start from the micro-batch handed to the callback. The original code
    # referenced an undefined name here, which raised NameError.
    df_batch = (
        batch_df
        # Temporary column: numeric rank of the operation, used as tie-breaker.
        .withColumn("op_numeral", F.when(F.col("Op") == "I", 1)
                                   .when(F.col("Op") == "U", 2)
                                   .when(F.col("Op") == "D", 3).cast("int"))
        # Temporary column: dms_timestamp parsed to a timestamp for ordering.
        .withColumn('dms_temp', F.to_timestamp(F.col("dms_timestamp")))
        # Not enabled:
        # .withColumn("par", F.col(all_pks[0]) % n_pars)
        # .withColumn("src_file", F.input_file_name())
        .withColumn("src_batch_id", F.lit(batch_id))
    )

    # The helper columns do not exist in the target Delta table, so they are
    # dropped after deduplication. Op is not in the target either, but it is
    # left in place here.
    cols_to_drop = ["dms_temp", "op_numeral"]

    # max() over a struct compares fields left to right, so putting dms_temp
    # and op_numeral first selects the most recent change per key.
    latest_uniques = (
        df_batch
        .selectExpr(*all_pks, "struct(dms_temp, op_numeral, *) as others")
        .groupBy(*all_pks)
        .agg(F.max("others").alias("latest"))
        .select("latest.*")
        .drop(*cols_to_drop)
    )


In [None]:
    
# The reader schema is the Bronze table's schema plus one extra string field,
# "Op". DMS does not add Op to full load.
bronze_schema = spark.read.format("delta").load(pm.bronze).schema
readers_schema = bronze_schema.add("Op", "string")

# Checkpoints will be written to...
checkpoint_parts = ['S3', 'bronze', '_checkpoints', 'abc', db, table]
checkpoint_path = os.path.join(*checkpoint_parts)

# Streaming Parquet file source over the staging path (plain file listing,
# not the Databricks Auto Loader). The glob filter keeps only file names whose
# 1st/2nd/3rd/4th characters are not L/O/A/D respectively, which leaves out
# the LOAD*.parquet full-load files.
stream_reader = spark.readStream.format("parquet")
stream_reader = stream_reader.option("recursiveFileLookup", "true")
stream_reader = stream_reader.option("pathGlobFilter", "[!L][!O][!A][!D]*.parquet")
stream_reader = stream_reader.schema(readers_schema)
df = stream_reader.load(pm.staging)


# Merge to Bronze


# One-shot run: hand every micro-batch of the stream to merge_to_delta.
# The checkpoint option remains disabled:
# .option("checkpointLocation", checkpoint_path)
streamingquery = df.writeStream.trigger(once=True).foreachBatch(merge_to_delta).start()

# Block until the triggered run has finished.
streamingquery.awaitTermination()

## Examine before-after compaction

## TODO: Examine query progress

In [75]:
# Display the most recent progress report of the streaming query.
streamingquery.lastProgress

{'id': '78c08a94-cc04-4cf0-86d7-0991ce907e96',
 'runId': 'e23cad9b-d0a4-4f6e-8d69-551bc16e224c',
 'name': None,
 'timestamp': '2021-08-19T07:35:59.064Z',
 'batchId': 0,
 'numInputRows': 8,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 4.640371229698376,
 'durationMs': {'addBatch': 252,
  'getBatch': 16,
  'latestOffset': 538,
  'queryPlanning': 0,
  'triggerExecution': 1724,
  'walCommit': 424},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/C:/Users/soura/PycharmProjects/opinnaytetyo/S3/staging/dms/abc/devices/device_models]',
   'startOffset': None,
   'endOffset': {'logOffset': 0},
   'numInputRows': 8,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 4.640371229698376}],
 'sink': {'description': 'ForeachBatchSink', 'numOutputRows': -1}}