# Generate Change Data Feed

In [1]:
import os
import pyspark.sql.functions as F

from random import randrange
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from datetime import datetime, date, timedelta, timezone

In [2]:
# Session settings: the Delta Lake package coordinates, the Delta SQL
# extension and catalog, and a UTC session time zone.
spark_settings = {
    'spark.jars.packages': 'io.delta:delta-core_2.12:1.0.0',
    'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
    'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog',
    'spark.sql.session.timeZone': 'UTC',
}

# Apply every setting to the builder before the session is created.
builder = SparkSession.builder.appName('GenerateCDC')
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

spark = builder.getOrCreate()


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Inspect the target Bronze table

We will use the Python API and relative paths to access the table, because the Hive metastore is not persistent in a single-node environment. In production, one could simply use the Hive table path when accessing the tables.

To keep this cell idempotent, we pin the read to version 0 of the table using Delta time travel. In our case, that version is the very first full load. Later, once we have performed MERGE operations, we can use time travel to compare the different versions.

In [3]:
# Path helper for the 'devices' / 'device_models' table; its `.bronze`
# attribute is the location read below.
pm = PathMerger('devices', 'device_models')

# Pin the reader to Delta version 0 so that re-running this cell always
# returns the same snapshot, regardless of later writes to the table.
bronze_reader = spark.read.format("delta").option("versionAsOf", 0)
df_tgt = bronze_reader.load(pm.bronze)

# Render the snapshot as a pandas DataFrame (last expression = cell output).
df_tgt.toPandas()

Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-09-11 11:30:04,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-09-11 11:30:04,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-09-11 11:30:04,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-09-11 11:30:04,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


## Generate Schema

We are mimicking the schema that AWS DMS uses. In addition to `dms_timestamp`, which also exists in the full-load data, there is now a new column, `Op`.

Note that our bronze may have extra columns such as `src_file` or `src_batch_id`.

In [4]:
# Column name / Spark type pairs of the change records, in output order.
schema_fields = [
    ('Op', 'string'),
    ('dms_timestamp', 'string'),
    ('id', 'integer'),
    ('release_date', 'date'),
    ('name', 'string'),
    ('color', 'string'),
    ('description', 'string'),
    ('created', 'timestamp'),
    ('modified', 'timestamp'),
]

# Build the schema one field at a time.
schema = StructType()
for field_name, field_type in schema_fields:
    schema.add(field_name, field_type)

# Display the finished schema as the cell output.
schema

StructType(List(StructField(Op,StringType,true),StructField(dms_timestamp,StringType,true),StructField(id,IntegerType,true),StructField(release_date,DateType,true),StructField(name,StringType,true),StructField(color,StringType,true),StructField(description,StringType,true),StructField(created,TimestampType,true),StructField(modified,TimestampType,true)))

## Generate DataFrame of INSERTS

In [7]:
def curr(offset=None):
    """
    Return the current local time, truncated to whole seconds.

    Params:
      * (Optional) offset: a timedelta object that is added to the current
        time before truncation.

    Creates a naive datetime object in seconds-precision.
    """

    stamp = datetime.now()
    shifted = stamp + offset if offset else stamp

    # The first six time-tuple entries are year, month, day, hour, minute and
    # second; rebuilding from them drops the microseconds.
    return datetime(*shifted.timetuple()[:6])

def datetime_to_dms_str(input_datetime):
    """
    Convert a datetime to UTC and format it as 'YYYY-MM-DD HH:MM:SS'.

    Params:
      * input_datetime: a datetime object. A naive value is interpreted as
        local time by `astimezone`.

    Returns the formatted timestamp string.
    """
    as_utc = input_datetime.astimezone(tz=timezone.utc)
    return as_utc.strftime('%Y-%m-%d %H:%M:%S')

# Take the timestamp once. Calling curr() separately for every field could
# straddle a second boundary and leave dms_timestamp, created and modified
# disagreeing within the same inserted row.
inserted_at = curr()
inserted_at_dms = datetime_to_dms_str(inserted_at)

# One tuple per INSERT ('I') record, in the column order of `schema`:
# Op, dms_timestamp, id, release_date, name, color, description, created, modified.
new_rows = [
    ('I', inserted_at_dms, 5, date(2021, 12, 31), 'Super Gadget 300', 'Black', 'new device', inserted_at, inserted_at),
    ('I', inserted_at_dms, 6, date(2021, 12, 31), 'Super Gadget 300', 'Pink', 'new device', inserted_at, inserted_at),
]

df_inserts = spark.createDataFrame(new_rows, schema)

In [8]:
# Print the generated INSERT change records as a text table.
df_inserts.show()

+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
| Op|      dms_timestamp| id|release_date|            name|color|description|            created|           modified|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
|  I|2021-09-11 11:37:17|  5|  2021-12-31|Super Gadget 300|Black| new device|2021-09-11 11:37:17|2021-09-11 11:37:17|
|  I|2021-09-11 11:37:17|  6|  2021-12-31|Super Gadget 300| Pink| new device|2021-09-11 11:37:17|2021-09-11 11:37:17|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+



## Generate DataFrame of UPDATES

In [9]:
def modify_row(idx, new_desc, **kwargs):
    """
    Build an UPDATE ('U') change record from a row of `df_tgt`.

    Params:
      * idx: value of the `id` column that selects the row to modify.
      * new_desc: new value for the `description` column.
      * **kwargs: forwarded to `curr()`, e.g. `offset=timedelta(...)`.

    Returns the first matching row with `Op`, `dms_timestamp`, `description`
    and `modified` overwritten and `src_batch_id` dropped.

    Raises ValueError if `df_tgt` has no row with the given id.
    """
    # Take the timestamp once so that dms_timestamp and modified always
    # describe the same instant; two curr() calls could straddle a second.
    changed_at = curr(**kwargs)

    new_row = (
        df_tgt.where(F.col('id') == idx)
        .withColumn('Op', F.lit('U'))
        .withColumn('dms_timestamp', F.lit(datetime_to_dms_str(changed_at)))
        .withColumn('description', F.lit(new_desc))
        .withColumn('modified', F.lit(changed_at))
        .drop('src_batch_id')
    ).first()

    # first() yields None for an empty result; fail here with a clear message
    # instead of letting a None slip into createDataFrame later on.
    if new_row is None:
        raise ValueError('No row with id {!r} found in df_tgt'.format(idx))

    return new_row

In [10]:
# (id, new description, extra keyword arguments for modify_row) per update.
# Id 2 is updated twice; the second update is shifted by an offset.
update_specs = [
    (1, 'update id 1', {}),
    (2, 'upddddddate id 2', {}),
    (2, 'update id 2', {'offset': timedelta(minutes=1, seconds=42)}),
]

new_rows = [
    modify_row(row_id, row_desc, **extra)
    for row_id, row_desc, extra in update_specs
]

# Reorder the columns to match the schema.
schema_order = [field.name for field in schema.fields]
df_updates = spark.createDataFrame(new_rows).select(schema_order)

In [11]:
# Print the generated UPDATE change records as a text table.
df_updates.show()

+---+-------------------+---+------------+----------------+-----+----------------+-------------------+-------------------+
| Op|      dms_timestamp| id|release_date|            name|color|     description|            created|           modified|
+---+-------------------+---+------------+----------------+-----+----------------+-------------------+-------------------+
|  U|2021-09-11 11:37:47|  1|  2010-05-15|Super Gadget 100|  Red|     update id 1|2010-03-21 12:00:01|2021-09-11 11:37:47|
|  U|2021-09-11 11:37:47|  2|  2010-05-15|Super Gadget 100|Black|upddddddate id 2|2010-03-21 12:00:02|2021-09-11 11:37:47|
|  U|2021-09-11 11:39:29|  2|  2010-05-15|Super Gadget 100|Black|     update id 2|2010-03-21 12:00:02|2021-09-11 11:39:29|
+---+-------------------+---+------------+----------------+-----+----------------+-------------------+-------------------+



## Generate DataFrame of DELETES

We will delete the row with `id` 5 that was created in the INSERTS step above.

In [14]:
# The delete happened some minutes after the insert.
delete_offset = timedelta(minutes=5, seconds=13)

# Take the timestamp once so that dms_timestamp and modified always agree;
# two separate curr() calls could straddle a second boundary.
deleted_at = curr(delete_offset)

# Reference the row to be deleted
row_to_delete = df_inserts.where(F.col("id") == 5).collect()[0]

# A DELETE ('D') record repeats the row's values; only the DMS timestamp and
# the modified timestamp move forward to the time of the delete.
new_rows = [
    (
        'D',
        datetime_to_dms_str(deleted_at),
        row_to_delete.id,
        row_to_delete.release_date,
        row_to_delete.name,
        row_to_delete.color,
        row_to_delete.description,
        row_to_delete.created,
        deleted_at,
    ),
]

df_deletes = spark.createDataFrame(new_rows, schema)

In [15]:
# Print the generated DELETE change record as a text table.
df_deletes.show()

+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
| Op|      dms_timestamp| id|release_date|            name|color|description|            created|           modified|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
|  D|2021-09-11 11:43:31|  5|  2021-12-31|Super Gadget 300|Black| new device|2021-09-11 11:37:17|2021-09-11 11:43:31|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+



## Union to a single DataFrame

Let's convert the combined DataFrame to pandas and use its built-in Parquet writer instead of Spark's.

In [16]:
# Combine the three change sets by column name rather than by position, so
# that a frame whose columns arrive in a different order cannot be silently
# misaligned. The result is converted to a pandas DataFrame.
df_all = (
    df_inserts
    .unionByName(df_updates)
    .unionByName(df_deletes)
    .toPandas()
)

df_all

Unnamed: 0,Op,dms_timestamp,id,release_date,name,color,description,created,modified
0,I,2021-09-11 11:37:17,5,2021-12-31,Super Gadget 300,Black,new device,2021-09-11 11:37:17,2021-09-11 11:37:17
1,I,2021-09-11 11:37:17,6,2021-12-31,Super Gadget 300,Pink,new device,2021-09-11 11:37:17,2021-09-11 11:37:17
2,U,2021-09-11 11:37:47,1,2010-05-15,Super Gadget 100,Red,update id 1,2010-03-21 12:00:01,2021-09-11 11:37:47
3,U,2021-09-11 11:37:47,2,2010-05-15,Super Gadget 100,Black,upddddddate id 2,2010-03-21 12:00:02,2021-09-11 11:37:47
4,U,2021-09-11 11:39:29,2,2010-05-15,Super Gadget 100,Black,update id 2,2010-03-21 12:00:02,2021-09-11 11:39:29
5,D,2021-09-11 11:43:31,5,2021-12-31,Super Gadget 300,Black,new device,2021-09-11 11:37:17,2021-09-11 11:43:31


## Write

In [17]:
# Get UTC time
now = datetime.now(tz=timezone.utc)

# DMS outputs to subdirs based on ./yyyy/mm/dd/
# NOTE(review): month and day are not zero-padded here (e.g. .../2021/9/11),
# unlike the yyyy/mm/dd pattern named above - confirm what downstream readers
# of the staging area expect before changing it.
output_dir = os.path.join(pm.staging, str(now.year), str(now.month), str(now.day))

# File name follows a pattern similar to yyyymmdd_hhmmss.parquet
file_name = now.strftime('%Y%m%d_%H%M%S') + '.parquet'

# Combine
output_path = os.path.join(output_dir, file_name)

# Verbose
print('[INFO] Writing to:', output_path)

# Create directories that do not exist. exist_ok=True keeps the cell
# re-runnable: without it, a second run on the same UTC day raises
# FileExistsError because the date directory is already there.
os.makedirs(output_dir, exist_ok=True)

# Write
df_all.to_parquet(output_path)

[INFO] Writing to: S3\staging\dms\abc\devices\device_models\2021\9\11\20210911_113913.parquet
