# Generate Change Data Feed

In [12]:
import os
import pyspark.sql.functions as F

from random import randrange
from helpers.paths import PathMerger
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from datetime import datetime, date

In [13]:
spark = (SparkSession.builder
         .appName('GenerateCDC')
         .config('spark.jars.packages', 'io.delta:delta-core_2.12:0.8.0')
         .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
         .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog')
         .config('spark.sql.session.timeZone', 'UTC')
         .getOrCreate())


# This cannot be imported before initializing the SparkSession.
from delta import DeltaTable

## Inspect the target Bronze table

In [14]:
# Init
pm = PathMerger('devices', 'device_models')

df_tgt = spark.read.format("delta").load(pm.bronze)

df_tgt.toPandas()

Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified,src_batch_id
0,2021-08-05 15:19:54,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01,
1,2021-08-05 15:19:54,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02,
2,2021-08-05 15:19:54,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00,
3,2021-08-05 15:19:54,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01,


## Generate Schema

In [15]:
# Init
schema = StructType()

# Add fields.
schema.add('Op', 'string')
schema.add('dms_timestamp', 'string')
schema.add('id', 'integer')
schema.add('release_date', 'date')
schema.add('name', 'string')
schema.add('color', 'string')
schema.add('description', 'string')
schema.add('created', 'timestamp')
schema.add('modified', 'timestamp')

StructType(List(StructField(Op,StringType,true),StructField(dms_timestamp,StringType,true),StructField(id,IntegerType,true),StructField(release_date,DateType,true),StructField(name,StringType,true),StructField(color,StringType,true),StructField(description,StringType,true),StructField(created,TimestampType,true),StructField(modified,TimestampType,true)))

## Generate DataFrame on INSERTS

In [16]:
def curr():
    now = datetime.now()
    return datetime(now.year, now.month, now.day, now.hour, now.minute, now.second)

def curr_str():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

new_rows = [
    ('I', curr_str(), 5, date(2021, 8, 1), 'Super Gadget 300', 'Black', 'new device', curr(), curr()),
    ('I', curr_str(), 6, date(2021, 8, 1), 'Super Gadget 300', 'Pink', 'new device', curr(), curr()),
]

df_inserts = spark.createDataFrame(new_rows, schema)

In [17]:
df_inserts.show()

+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
| Op|      dms_timestamp| id|release_date|            name|color|description|            created|           modified|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
|  I|2021-08-19 10:46:24|  5|  2021-08-01|Super Gadget 300|Black| new device|2021-08-19 07:46:24|2021-08-19 07:46:24|
|  I|2021-08-19 10:46:24|  6|  2021-08-01|Super Gadget 300| Pink| new device|2021-08-19 07:46:24|2021-08-19 07:46:24|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+



## Generate DataFrame of UPDATES

In [18]:
def modify_row(idx, new_desc):
    new_row = ( 
        df_tgt.where(F.col('id') == idx)
        .withColumn('Op', F.lit('U'))
        .withColumn('dms_timestamp', F.lit(curr_str()))
        .withColumn('description', F.lit(new_desc))
        .withColumn('modified', F.lit(curr()))
        .drop('src_batch_id')
    ).first()
    
    return new_row

In [19]:
new_rows = [
    modify_row(1, 'update A'),
    modify_row(2, 'update B'),
]

df_updates = spark.createDataFrame(new_rows)

# Reorder to match the schema
df_updates = df_updates.select([x.name for x in schema])

In [20]:
df_updates.show()

+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
| Op|      dms_timestamp| id|release_date|            name|color|description|            created|           modified|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+
|  U|2021-08-19 10:46:36|  1|  2010-05-15|Super Gadget 100|  Red|   update A|2010-03-21 12:00:01|2021-08-19 07:46:36|
|  U|2021-08-19 10:46:37|  2|  2010-05-15|Super Gadget 100|Black|   update B|2010-03-21 12:00:02|2021-08-19 07:46:37|
+---+-------------------+---+------------+----------------+-----+-----------+-------------------+-------------------+



## Union to a single DataFrame

In [21]:
df_all = df_inserts.union(df_updates).toPandas()

df_all

Unnamed: 0,Op,dms_timestamp,id,release_date,name,color,description,created,modified
0,I,2021-08-19 10:46:24,5,2021-08-01,Super Gadget 300,Black,new device,2021-08-19 07:46:24,2021-08-19 07:46:24
1,I,2021-08-19 10:46:24,6,2021-08-01,Super Gadget 300,Pink,new device,2021-08-19 07:46:24,2021-08-19 07:46:24
2,U,2021-08-19 10:46:36,1,2010-05-15,Super Gadget 100,Red,update A,2010-03-21 12:00:01,2021-08-19 07:46:36
3,U,2021-08-19 10:46:37,2,2010-05-15,Super Gadget 100,Black,update B,2010-03-21 12:00:02,2021-08-19 07:46:37


## Write

In [22]:
now = datetime.now()

# DMS outputs to subdirs based on ./yyyy/mm/dd/
output_dir = os.path.join(pm.staging, str(now.year), str(now.month), str(now.day))

# File name follows are pattern similar to yyyymmdd_hhmmss.parquet
file_name = now.strftime('%Y%m%d_%H%M%S') + '.parquet'

# Combine
output_path = os.path.join(output_dir, file_name)

# Verbose
print('[INFO] Writing to:', output_path)

# Create directories that do not exist
os.makedirs(output_dir) 

# Write
df_all.to_parquet(output_path)

[INFO] Writing to: S3\staging\dms\abc\devices\device_models\2021\8\19\20210819_104713.parquet
