##### Local Environment Setup

In [1]:

import os
import sys
# Set JAVA env variable
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.26.4-hotspot"
# Set Hadoop environment variables
os.environ['HADOOP_HOME'] = r'C:\hadoop'
# Prepend Hadoop's bin directory to PATH only when it is not already listed,
# so re-running this cell in the same kernel does not keep growing PATH.
hadoop_bin = os.environ['HADOOP_HOME'] + r'\bin'
current_path = os.environ.get('PATH', '')
if hadoop_bin not in current_path.split(';'):
    os.environ['PATH'] = hadoop_bin + ';' + current_path
# Set the Python executable path explicitly
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Get Pipeline Tools Path (added once; a re-run must not duplicate the entry)
tools_dir = R'C:\GitHub\Tools\de'
if tools_dir not in sys.path:
    sys.path.append(tools_dir)


##### Libraries

In [2]:
import logging
import time
from datetime import datetime
from typing import Dict

import numpy as np
import pandas as pd

from delta.tables import DeltaTable
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.types import (StructType, StructField, StringType,
                            DoubleType, IntegerType, TimestampType,
                            DateType)


In [3]:
from de_pipeline_tools import *

##### Spark Session

In [4]:
spark = initialize_local_spark_delta_lake("Financial Data Pipeline")

2025-04-18 22:07:40,489 - INFO - ---Spark session initialized with Delta Lake support---


In [5]:
# Create the database in the Hive warehouse if it doesn't already exist
spark.sql("CREATE DATABASE IF NOT EXISTS de_pipelines LOCATION 'C:/hive-warehouse/de_pipelines'")

DataFrame[]


## SCD Type 2:
##### Initial Creation (Overwrite)

##### Inputs/Outputs

In [6]:
# Initial creation: source files for the first (overwrite) load
data_dir = "../../data"
file_list = [
    "financial_transactions_20250409_113413.csv",
    "financial_transactions_20250410_113413.csv"
]

# Resolve every file name against data_dir and convert it to an absolute path
abs_file_list = [
    os.path.abspath(os.path.join(data_dir, file_name))
    for file_name in file_list
]

# Target tables for the bronze, silver and gold layers
bronze_table = 'de_pipelines.financial_osb_bronze_type2'
silver_table = 'de_pipelines.financial_osb_silver_type2'
gold_table   = 'de_pipelines.financial_osb_gold_type2'


In [7]:
def get_schema():
    """Return the explicit schema used when reading the transaction CSV files.

    Returns:
        StructType: twelve fields; ``amount`` and ``balance_after`` are
        doubles, every other column (including ``timestamp``) is a string.
        Only ``transaction_id`` is declared non-nullable.

    NOTE(review): the bronze schema logged later in this notebook reports
    ``transaction_id`` as ``nullable = true``, so the non-nullable flag does
    not appear to be enforced on read -- confirm against the reader in use.
    """
    # (column name, Spark type class, nullable)
    column_specs = [
        ("transaction_id", StringType, False),
        ("timestamp", StringType, True),  # load in as string type, handle in silver step
        ("customer_id", StringType, True),
        ("account_number", StringType, True),
        ("transaction_type", StringType, True),
        ("amount", DoubleType, True),
        ("currency", StringType, True),
        ("balance_after", DoubleType, True),
        ("status", StringType, True),
        ("merchant", StringType, True),
        ("category", StringType, True),
        ("location", StringType, True),
    ]

    return StructType([
        StructField(column_name, spark_type(), is_nullable)
        for column_name, spark_type, is_nullable in column_specs
    ])

#### Define: Validation Rules, Transformations, Write Executions

In [8]:
def bronze_transform(df:DataFrame) -> DataFrame:
    """Add the SCD Type 2 bookkeeping columns to a bronze batch.

    Rows are ranked per (transaction_id, account_number) by ``timestamp``
    descending. The top-ranked row gets ``is_current = 1`` and a NULL
    ``end_date``; every other row gets ``is_current = 0`` and
    ``end_date`` set to the current date. ``start_date`` is the current date
    for all rows.

    Args:
        df: Input batch; must contain ``transaction_id``, ``account_number``
            and ``timestamp`` columns.

    Returns:
        The input DataFrame with ``is_current``, ``start_date`` and
        ``end_date`` appended (the helper ``row_num`` column is dropped).
    """
    # Newest record first within each transaction/account pair
    newest_first = (
        Window
        .partitionBy("transaction_id", "account_number")
        .orderBy(F.desc("timestamp"))
    )

    ranked = df.withColumn("row_num", F.row_number().over(newest_first))

    # Rank 1 is the current record, everything else is historical
    flagged = ranked.withColumn(
        "is_current",
        F.when(F.col("row_num") == 1, 1).otherwise(0),
    )
    opened = flagged.withColumn("start_date", F.current_date())

    # Historical rows are closed immediately; current rows stay open (NULL)
    closed = opened.withColumn(
        "end_date",
        F.when(F.col("is_current") == 0, F.current_date())
         .otherwise(F.lit(None).cast("timestamp")),
    )

    return closed.drop("row_num")

In [9]:
def bronze_writer(df: DataFrame, table_name: str):
    """Overwrite the bronze Delta table with ``df``.

    Args:
        df: DataFrame to persist.
        table_name: Fully qualified name passed to ``saveAsTable``.

    Returns:
        None. The write uses mode ``overwrite`` with ``overwriteSchema``
        enabled, so the table's schema is replaced along with its data.
    """
    delta_writer = df.write.format("delta")
    delta_writer = delta_writer.mode("overwrite")
    delta_writer = delta_writer.option("overwriteSchema", "true")
    delta_writer.saveAsTable(table_name)

    return None

In [10]:
# Define bronze validation rules
# Each rule is a dict with a name, a SQL-style condition string and a
# human-readable description. The descriptions state exactly what the
# conditions enforce (NULL values fail the amount and timestamp rules).
bronze_validation_rules = [
    {
        "name": "has_transaction_id",
        "condition": "transaction_id IS NOT NULL",
        "description": "Transaction ID must be present"
    },
    {
        "name": "valid_amount",
        "condition": "amount IS NOT NULL AND amount > 0",
        "description": "Amount must be present and positive"
    },
    {
        "name": "valid_timestamp",
        "condition": "timestamp IS NOT NULL AND timestamp <= current_timestamp()",
        "description": "Timestamp must be present and not in the future"
    }
]

In [11]:
def silver_transform(df:DataFrame) -> DataFrame:
    """Clean, standardise and filter bronze transactions for the silver layer.

    Steps, in order:
      1. Strip characters other than digits, ``-``, ``:`` and spaces from the
         string ``timestamp`` column, then cast it to a timestamp.
      2. Drop fully duplicated rows.
      3. Take the absolute value of ``amount`` and lower-case
         ``transaction_type``, ``category`` and ``status``.
      4. Keep only rows with a transaction id, an account number, a positive
         amount, and a timestamp that is either NULL or not in the future.
      5. Derive ``transaction_date``, ``transaction_time`` and ``year_month``
         from ``timestamp`` and stamp ``processing_timestamp``.

    Args:
        df: Bronze DataFrame containing at least ``transaction_id``,
            ``account_number``, ``timestamp``, ``amount``,
            ``transaction_type``, ``category`` and ``status``.

    Returns:
        The cleaned DataFrame with the derived columns appended.
    """
    # Clean any non-timestamp characters first
    df = df.withColumn(
        "timestamp",
        F.regexp_replace(F.col("timestamp"), "[^0-9\\-: ]", "")
    )

    # Cast to timestamp type
    df = df.withColumn("timestamp", F.col("timestamp").cast("timestamp"))

    # Remove duplicates
    df = df.dropDuplicates()

    # Standardize Data
    df = (df
            .withColumn("amount", F.abs(F.col("amount")))
            .withColumn("transaction_type", F.lower(F.col("transaction_type")))
            .withColumn("category", F.lower(F.col("category")))
            .withColumn("status", F.lower(F.col("status")))
    )

    # Filter Data
    # Address bronze layer data validation check concerns.
    # The future-timestamp check compares against current_timestamp() rather
    # than current_date(): a date is midnight of today, which would wrongly
    # drop valid transactions stamped earlier today. This also matches the
    # "timestamp <= current_timestamp()" validation rules.
    df = df.filter(
                    (F.col('transaction_id').isNotNull()) # transaction id must exist
                    & (F.col('account_number').isNotNull()) # account number must exist
                    & (F.col('amount') > 0) # amount must be positive (NULL amounts are dropped too)
                    & ((F.col('timestamp') <= F.current_timestamp()) # must not be in the future
                    |(F.col('timestamp').isNull())) # or may be NULL
    )

    # Split timestamp into date and time and year_month for partitioning
    df = (df
            .withColumn("transaction_date", F.to_date("timestamp"))
            .withColumn("transaction_time", F.date_format("timestamp", "HH:mm:ss"))
            .withColumn("year_month", F.date_format(F.col("transaction_date"), "yyyy-MM"))
    )

    # Add processing timestamp for bookkeeping
    df = (df
            .withColumn("processing_timestamp", F.current_timestamp())
    )

    return df

In [12]:
def silver_writer(df: DataFrame, table_name: str):
    """Overwrite the silver Delta table with ``df``, partitioned by month.

    Args:
        df: DataFrame to persist; must contain a ``year_month`` column.
        table_name: Fully qualified name passed to ``saveAsTable``.

    Returns:
        None. The write uses mode ``overwrite`` with ``overwriteSchema``
        enabled and partitions the table by ``year_month``.
    """
    delta_writer = df.write.format("delta")
    delta_writer = delta_writer.mode("overwrite")
    delta_writer = delta_writer.option("overwriteSchema", "true")
    delta_writer = delta_writer.partitionBy("year_month")
    delta_writer.saveAsTable(table_name)

    return None

In [13]:
# Define silver validation rules
# Each rule is a dict with a name, a SQL-style condition string and a
# human-readable description of what the condition enforces.
# NOTE(review): the status counts shown later in this notebook include
# 'disputed' and 'reversed' at volumes similar to the allowed values, and
# neither is in the valid_status list below -- confirm whether they should be
# accepted or are meant to be flagged.
silver_validation_rules = [
    {
        "name": "valid_transaction_type",
        "condition": "transaction_type IN ('debit', 'credit', 'transfer', 'payment', 'withdrawal', 'deposit') OR transaction_type IS NULL",
        "description": "Transaction type must be one of the valid types"
    },
    {
        "name": "valid_status",
        "condition": "status IN ('completed', 'pending', 'failed', 'cancelled', 'refunded') OR status IS NULL",
        "description": "Status must be one of the valid statuses"
    },
    {
        "name": "valid_currency",
        "condition": "currency IS NOT NULL AND length(currency) = 3",
        "description": "Currency code must be present and exactly 3 characters"
    },
    {
        "name": "valid_timestamp",
        "condition": "timestamp IS NOT NULL AND timestamp <= current_timestamp()",
        "description": "Timestamp must not be NULL or in the future"
    }
]

In [14]:
def gold_transform(df:DataFrame) -> Dict:
    """Build the gold-layer summary DataFrames from silver transactions.

    Args:
        df: Silver DataFrame containing ``transaction_id``, ``amount``,
            ``customer_id``, ``category``, ``transaction_type`` and
            ``transaction_date``.

    Returns:
        Dict mapping a summary name to its DataFrame:
          * ``daily_category`` - per (transaction_date, category) counts,
            amount sum/avg/min/max and distinct customers.
          * ``customer_summary`` - per customer counts, amount sum/avg,
            first/last transaction date, approximate distinct category count
            and days since the last transaction.
          * ``transaction_type_summary`` - per transaction type counts and
            amount sum/avg.
        Every summary carries a ``processing_timestamp`` column.
    """
    def _core_metrics():
        # Count / total / average aggregates shared by all three summaries
        return [
            F.count("transaction_id").alias("transaction_count"),
            F.sum("amount").alias("total_amount"),
            F.avg("amount").alias("avg_amount"),
        ]

    # Gold aggregation 1: Daily summary by category
    per_day_category = df.groupBy("transaction_date", "category").agg(
        *_core_metrics(),
        F.min("amount").alias("min_amount"),
        F.max("amount").alias("max_amount"),
        F.countDistinct("customer_id").alias("unique_customers"),
    )
    per_day_category = per_day_category.withColumn(
        "processing_timestamp", F.current_timestamp()
    )

    # Gold aggregation 2: Customer summary
    per_customer = df.groupBy("customer_id").agg(
        *_core_metrics(),
        F.min("transaction_date").alias("first_transaction_date"),
        F.max("transaction_date").alias("last_transaction_date"),
        F.approx_count_distinct("category").alias("category_count"),
    )
    per_customer = per_customer.withColumn(
        "processing_timestamp", F.current_timestamp()
    )
    per_customer = per_customer.withColumn(
        "days_since_last_transaction",
        F.datediff(F.current_date(), F.col("last_transaction_date")),
    )

    # Gold aggregation 3: Transaction type summary
    per_transaction_type = df.groupBy("transaction_type").agg(*_core_metrics())
    per_transaction_type = per_transaction_type.withColumn(
        "processing_timestamp", F.current_timestamp()
    )

    return {
        "daily_category": per_day_category,
        "customer_summary": per_customer,
        "transaction_type_summary": per_transaction_type,
    }

In [15]:
def gold_writer(df: DataFrame, table_name: str):
    """Overwrite a gold Delta table with ``df``.

    Args:
        df: Summary DataFrame to persist.
        table_name: Fully qualified name passed to ``saveAsTable``.

    Returns:
        None. The write uses mode ``overwrite`` with ``overwriteSchema``
        enabled.
    """
    delta_writer = df.write.format("delta")
    delta_writer = delta_writer.mode("overwrite")
    delta_writer = delta_writer.option("overwriteSchema", "true")
    delta_writer.saveAsTable(table_name)

    return None

In [16]:
# Define gold validation rules
# Built from (name, condition, description) triples; each rule ends up as a
# dict with exactly those three keys.
gold_validation_rules = [
    dict(name=rule_name, condition=rule_condition, description=rule_description)
    for rule_name, rule_condition, rule_description in (
        (
            "positive_transaction_counts",
            "transaction_count > 0",
            "Transaction counts should be positive",
        ),
        (
            "valid_total_amounts",
            "total_amount >= 0",
            "Total amounts should not be negative",
        ),
    )
]

### Running Full Batch Pipeline

In [17]:
# Full batch run of the initial load: read the CSV files with the explicit
# schema and pass the overwrite writers, transforms and validation rules for
# the bronze, silver and gold layers. The returned status dict is displayed
# as the cell output.
run_batch_de_pipeline(spark, 'csv', abs_file_list, get_schema(), 
                     bronze_table=bronze_table, silver_table=silver_table, gold_table=gold_table, 
                     bronze_transform=bronze_transform, silver_transform=silver_transform, gold_transform=gold_transform,
                     bronze_writer=bronze_writer, silver_writer=silver_writer, gold_writer=gold_writer,
                     bronze_validation_rules=bronze_validation_rules, 
                     silver_validation_rules=silver_validation_rules, 
                     gold_validation_rules=gold_validation_rules,
                     pipeline_name='Financial_DE_Pipeline')

2025-04-18 22:08:01,840 - INFO - --Starting data pipeline execution with ID: Financial_DE_Pipeline_20250418_220801--
2025-04-18 22:08:01,842 - INFO - Starting bronze layer processing
2025-04-18 22:08:02,620 - INFO - Successfully read CSV data from: 
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250409_113413.csv
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250410_113413.csv
2025-04-18 22:08:02,972 - INFO - Transformation function applied
2025-04-18 22:08:02,974 - INFO - Writing to bronze table: de_pipelines.financial_osb_bronze_type2
2025-04-18 22:08:37,856 - INFO - Successfully wrote to bronze table: de_pipelines.financial_osb_bronze_type2
2025-04-18 22:08:37,858 - INFO - Write Metrics: 
[
  {
    "numOutputRows": "1000",
    "numOutputBytes": "43252",
    "numFiles": "1"
  }
]
2025-04-18 22:08:38,858 - INFO - Running data quality checks for bronze layer
2025-04-18 22:08:49,778 - INFO - Data Quality Metrics for bronze layer:
2025-04-18 22:08:49,780 - INFO -   - 

{'status': 'success',
 'pipeline_id': 'Financial_DE_Pipeline_20250418_220801',
 'bronze_version': 96,
 'silver_version': 62,
 'timestamp': '2025-04-18T22:11:14.186954',
 'duration_seconds': 192.3431088924408,
 'metrics': {'pipeline_id': 'Financial_DE_Pipeline_20250418_220801',
  'start_time': '2025-04-18T22:08:01.840845',
  'stages': {'bronze': {'duration_seconds': 58.844189405441284,
    'version': 96,
    'status': 'success'},
   'bronze_optimize': {'layer': 'bronze',
    'duration_seconds': 4.1228249073028564,
    'status': 'success'},
   'silver': {'duration_seconds': 35.54041361808777,
    'version': 62,
    'status': 'success',
    'source_bronze_version': 96},
   'silver_optimize': {'layer': 'silver',
    'duration_seconds': 11.970736742019653,
    'status': 'success'},
   'gold': {'duration_seconds': 76.18854546546936,
    'status': 'success',
    'source_silver_version': 62,
    'tables': ['daily_category',
     'customer_summary',
     'transaction_type_summary']},
   'gold_o

### Running Layers In Isolation

In [18]:
spark = initialize_local_spark_delta_lake("Financial Data Pipeline - Testing")

2025-04-18 22:11:15,706 - INFO - ---Spark session initialized with Delta Lake support---


#### Bronze

In [19]:
# Run only the bronze stage in 'test' mode with no writer. The log output of
# this cell shows the read, transform and quality-check steps but no table
# write -- presumably test mode skips persistence; confirm in de_pipeline_tools.
bronzedf, bronze_version = process_batch_bronze_layer(spark, 'csv', abs_file_list, get_schema(), bronze_table,
                               bronze_transform=bronze_transform, validation_rules=bronze_validation_rules,
                               pipeline_id='test', mode='test', bronze_writer=None)

2025-04-18 22:11:17,189 - INFO - Starting bronze layer processing
2025-04-18 22:11:17,254 - INFO - Successfully read CSV data from: 
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250409_113413.csv
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250410_113413.csv
2025-04-18 22:11:17,323 - INFO - Transformation function applied
2025-04-18 22:11:17,327 - INFO - Running data quality checks for bronze layer
2025-04-18 22:11:24,347 - INFO - Data Quality Metrics for bronze layer:
2025-04-18 22:11:24,349 - INFO -   - Shape: [18,1000] (approx. row count)
2025-04-18 22:11:24,351 - INFO -   - Schema: 
root
 |-- transaction_id: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- account_number: string (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- balance_after: double (nullable = true)
 |-- status: string (nullable 

In [20]:
bronzedf.show(5)

+--------------+-------------------+-----------+--------------+----------------+-------+--------+-------------+---------+--------+--------+-----------------+--------------------+--------------------+--------+----------+----------+--------+
|transaction_id|          timestamp|customer_id|account_number|transaction_type| amount|currency|balance_after|   status|merchant|category|         location| ingestion_timestamp|         source_file|batch_id|is_current|start_date|end_date|
+--------------+-------------------+-----------+--------------+----------------+-------+--------+-------------+---------+--------+--------+-----------------+--------------------+--------------------+--------+----------+----------+--------+
|          NULL|2023-09-11 10:07:08| CUST001016| ACCT-19335534|          refund|1248.85|     CAD|     12133.35|completed|   Zelle| savings|       Denver, CO|2025-04-18 22:11:...|file:///c:/GitHub...|    test|         1|2025-04-18|    NULL|
|          NULL|2023-08-15 05:23:54| CUS

#### Silver

In [21]:
# Run only the silver stage in 'test' mode with no writer, reading from the
# bronze table. bronze_version=None: the log shows bronze version 96 being
# read -- presumably None selects the latest version; confirm in de_pipeline_tools.
silverdf, silver_version = process_batch_silver_layer(spark, bronze_table, bronze_version=None, 
                                                   silver_table=silver_table, 
                                                   silver_transform=silver_transform, 
                                                   validation_rules=silver_validation_rules,
                                                   pipeline_id='test', mode='test', 
                                                   silver_writer=None)

2025-04-18 22:11:32,199 - INFO - Starting silver layer processing
2025-04-18 22:11:33,783 - INFO - Successfully read bronze data version 96
2025-04-18 22:11:33,912 - INFO - Transformation function applied
2025-04-18 22:11:33,916 - INFO - Running data quality checks for silver layer
2025-04-18 22:11:44,726 - INFO - Data Quality Metrics for silver layer:
2025-04-18 22:11:44,728 - INFO -   - Shape: [22,950] (approx. row count)
2025-04-18 22:11:44,731 - INFO -   - Schema: 
root
 |-- transaction_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- account_number: string (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- balance_after: double (nullable = true)
 |-- status: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- ingestio

In [22]:
silverdf.show(5)

+--------------+-------------------+-----------+--------------+----------------+------+--------+-------------+--------+--------------+---------+-----------------+--------------------+--------------------+--------------------+----------+----------+--------+----------------+----------------+----------+--------------------+
|transaction_id|          timestamp|customer_id|account_number|transaction_type|amount|currency|balance_after|  status|      merchant| category|         location| ingestion_timestamp|         source_file|            batch_id|is_current|start_date|end_date|transaction_date|transaction_time|year_month|processing_timestamp|
+--------------+-------------------+-----------+--------------+----------------+------+--------+-------------+--------+--------------+---------+-----------------+--------------------+--------------------+--------------------+----------+----------+--------+----------------+----------------+----------+--------------------+
|   TXN00000166|2023-06-23 20:0

##### Investigating Silver Validation Check Failures:
Corrupt "status" column

In [23]:
silverdf.groupBy('status').count().show()

+---------+-----+
|   status|count|
+---------+-----+
| dqsputed|    1|
|  pgnding|    1|
|completed|  195|
|   failen|    1|
|   failed|  179|
|bompleted|    1|
|     NULL|   17|
|comcleted|    1|
| disputad|    1|
| dmsputed|    1|
| diiputed|    1|
|  pending|  164|
| disputed|  190|
| reverszd|    1|
| reversed|  196|
+---------+-----+



#### Gold

In [24]:
# Run only the gold stage in 'test' mode with no writer, reading from the
# silver table. The result is used below as a dict of summary DataFrames.
# silver_version=None: presumably selects the latest silver version (the log
# reports version 63) -- confirm in de_pipeline_tools.
gold_dfs = process_batch_gold_layer(spark, silver_table, silver_version=None, 
                                 gold_table=gold_table, 
                                 gold_transform=gold_transform, 
                                 validation_rules=gold_validation_rules, 
                                 pipeline_id='test', 
                                 mode='test',
                                 gold_writer=None)

2025-04-18 22:11:56,474 - INFO - Starting gold layer processing
2025-04-18 22:11:57,276 - INFO - Successfully read silver data version 63
2025-04-18 22:11:57,434 - INFO - Transformation function applied
2025-04-18 22:11:57,437 - INFO - Running data quality checks for de_pipelines.financial_osb_gold_type2_daily_category layer
2025-04-18 22:12:08,562 - INFO - Data Quality Metrics for de_pipelines.financial_osb_gold_type2_daily_category layer:
2025-04-18 22:12:08,564 - INFO -   - Shape: [9,702] (approx. row count)
2025-04-18 22:12:08,565 - INFO -   - Schema: 
root
 |-- transaction_date: date (nullable = true)
 |-- category: string (nullable = true)
 |-- transaction_count: long (nullable = false)
 |-- total_amount: double (nullable = true)
 |-- avg_amount: double (nullable = true)
 |-- min_amount: double (nullable = true)
 |-- max_amount: double (nullable = true)
 |-- unique_customers: long (nullable = false)
 |-- processing_timestamp: timestamp (nullable = false)

2025-04-18 22:12:08,566 

In [25]:
# Preview the first five rows of every gold summary DataFrame
for summary_df in gold_dfs.values():
    summary_df.show(5)

+----------------+----------+-----------------+------------+----------+----------+----------+----------------+--------------------+
|transaction_date|  category|transaction_count|total_amount|avg_amount|min_amount|max_amount|unique_customers|processing_timestamp|
+----------------+----------+-----------------+------------+----------+----------+----------+----------------+--------------------+
|      2023-05-25|      fees|                1|       30.89|     30.89|     30.89|     30.89|               1|2025-04-18 22:12:...|
|      2023-09-25|    income|                1|      1384.1|    1384.1|    1384.1|    1384.1|               1|2025-04-18 22:12:...|
|      2023-06-22|      NULL|                1|       64.69|     64.69|     64.69|     64.69|               1|2025-04-18 22:12:...|
|      2023-12-18|investment|                1|      626.53|    626.53|    626.53|    626.53|               1|2025-04-18 22:12:...|
|      2023-08-28|   housing|                1|       37.49|     37.49|     


## SCD Type 2: 
##### Change Data Capture (Merge)

##### Inputs/Outputs

In [26]:
# Change Data Capture: files merged into the existing tables
cdc_file_list = [
    "financial_transactions_20250410_113413_update.csv",
    "financial_transactions_20250414_113413.csv",
    "financial_transactions_20250417_113413.csv"
]

# Resolve every CDC file name against data_dir and convert it to an absolute path
cdc_abs_file_list = [
    os.path.abspath(os.path.join(data_dir, cdc_file_name))
    for cdc_file_name in cdc_file_list
]


#### Define: Validation Rules(Same), Transformations(Same), Write Executions (Merge)

In [None]:
def cdc_bronze_writer(df: DataFrame, table_name: str):
    """Merge an incoming batch into the bronze table as SCD Type 2 rows.

    The batch is ranked per (transaction_id, account_number) by ``timestamp``
    descending. Rank-1 rows become the "source" temp view; the remaining rows
    are kept aside as the archive set. Three MERGE statements then run in
    this order (the order matters):

      1. Close changed records: target rows that are current, share the key
         with a source row and have a different ``balance_after`` get
         ``is_current = 0`` and ``end_date = CURRENT_DATE()``.
      2. Insert new versions: source rows with no *current* target row for
         their key are inserted with ``is_current = 1``, ``start_date`` set
         to the current date and a NULL ``end_date``. Rows closed in step 1
         no longer match, so their replacements are inserted here.
      3. Insert history (only if the archive set is non-empty): archive rows
         whose key has a current target row, and for which no non-current
         target row with the same ``balance_after`` exists, are inserted
         with ``is_current = 0``.

    Args:
        df: Incoming batch; must contain ``transaction_id``,
            ``account_number``, ``timestamp`` and ``balance_after``.
        table_name: Target Delta table, interpolated into the MERGE SQL.

    Returns:
        None.

    Notes:
        * Uses the module-level ``spark`` session rather than a parameter.
        * Registers temp views named "source" and "archive".
        * NOTE(review): the key and ``balance_after`` comparisons use ``=`` /
          ``!=``, which are not null-safe in SQL, so rows with NULL keys
          never match an existing row -- confirm this is intended.
    """
    # Split into latest and historical records
    df = df.withColumn("row_num", F.row_number().over(
        Window.partitionBy("transaction_id", "account_number")
        .orderBy(F.col("timestamp").desc())
        )
    )
    # Everything except the newest row per key is treated as history
    archivedf = df.filter(F.col("row_num") != 1).drop("row_num")
    df = df.filter(F.col("row_num") == 1).drop("row_num")

    # Expose the newest rows to the SQL MERGE statements below
    df.createOrReplaceTempView("source")

    # Update existing records
    # (step 1: close current rows whose balance_after has changed)
    spark.sql(f"""
        MERGE INTO {table_name} t
        USING source s
        ON t.transaction_id = s.transaction_id
            AND t.account_number = s.account_number
            AND t.is_current=1
            AND t.balance_after != s.balance_after
        WHEN MATCHED THEN
        UPDATE SET
            t.is_current = 0,
            t.end_date = CURRENT_DATE()
    """)

    # Insert new records
    # (step 2: keys without a current row -- brand new or just closed above)
    spark.sql(f"""
        MERGE INTO {table_name} t
        USING (
            SELECT
                *,
                1 AS is_current,
                CURRENT_DATE() AS start_date,
                CAST(NULL AS TIMESTAMP) AS end_date
            FROM source
        ) s
        ON s.transaction_id = t.transaction_id 
            AND s.account_number = t.account_number 
            AND t.is_current=1
        WHEN NOT MATCHED THEN
            INSERT *
    """)

    # Handle historical records
    # (step 3: skipped entirely when the batch had one row per key)
    if not archivedf.rdd.isEmpty():
        archivedf.createOrReplaceTempView("archive")

        # Insert Historical Records using MERGE
        # The always-false ON clause turns the MERGE into a pure insert of
        # the pre-filtered archive rows.
        spark.sql(f"""
            MERGE INTO {table_name} t
            USING (
                SELECT
                    a.*,
                    0 AS is_current,
                    CURRENT_DATE() AS start_date,
                    CURRENT_DATE() AS end_date
                FROM archive a
                INNER JOIN {table_name} t2
                ON a.transaction_id = t2.transaction_id 
                    AND a.account_number = t2.account_number
                WHERE t2.is_current = 1
                AND NOT EXISTS (
                    SELECT 1 FROM {table_name} t3
                    WHERE t3.transaction_id = a.transaction_id
                    AND t3.account_number = a.account_number
                    AND t3.balance_after = a.balance_after
                    AND t3.is_current = 0
                )
            ) AS matched_records
            ON 1=0  -- Forces all records to INSERT (if there are any)
            WHEN NOT MATCHED THEN
                INSERT *
        """)

    return None

In [28]:
def cdc_silver_writer(df: DataFrame, table_name: str):
    """Merge a silver batch into the silver table as SCD Type 2 rows.

    ``df`` is registered as the "source" temp view and three MERGE statements
    run in this order (the order matters):

      1. Close changed records: current target rows that share the key with
         a current source row and have a different ``balance_after`` get
         ``is_current = 0`` and ``end_date = CURRENT_DATE()``.
      2. Insert new versions: current source rows with no *current* target
         row for their key are inserted as-is. Rows closed in step 1 no
         longer match, so their replacements are inserted here.
      3. Insert history: non-current source rows whose key has a current
         target row, and for which no non-current target row with the same
         ``balance_after`` exists, are inserted as-is.

    Args:
        df: Silver batch; must already carry ``is_current`` along with
            ``transaction_id``, ``account_number`` and ``balance_after``.
        table_name: Target Delta table, interpolated into the MERGE SQL.

    Returns:
        None.

    Notes:
        * Uses the module-level ``spark`` session rather than a parameter.
        * Registers a temp view named "source".
        * NOTE(review): the comparisons use ``=`` / ``!=``, which are not
          null-safe in SQL -- confirm NULL ``balance_after`` values are
          handled as intended.
    """

    # Expose the batch to the SQL MERGE statements below
    df.createOrReplaceTempView("source")

    # Update Existing Records
    # (step 1: close current rows whose balance_after has changed)
    spark.sql(f"""
        MERGE INTO {table_name} t
        USING source s
        ON t.transaction_id = s.transaction_id
            AND t.account_number = s.account_number
            AND t.is_current=1
            AND s.is_current=1
            AND t.balance_after != s.balance_after
        WHEN MATCHED THEN
        UPDATE SET
            t.is_current = 0,
            t.end_date = CURRENT_DATE()

    """)

    # Insert New Records
    # (step 2: only current source rows are eligible for insertion here)
    spark.sql(f"""
        MERGE INTO {table_name} t
        USING source s
        ON s.transaction_id = t.transaction_id 
            AND s.account_number = t.account_number 
            AND t.is_current=1
        WHEN NOT MATCHED AND s.is_current=1 THEN
            INSERT *
    """)

    # Insert Historical Records
    # (step 3: the always-false ON clause turns the MERGE into a pure insert
    # of the pre-filtered historical rows)
    spark.sql(f"""
        MERGE INTO {table_name} t
        USING (
            SELECT s.*
            FROM source s
            INNER JOIN {table_name} t2
            ON s.transaction_id = t2.transaction_id 
                AND s.account_number = t2.account_number
            WHERE t2.is_current = 1
                AND s.is_current = 0
                AND NOT EXISTS (
                    SELECT 1 FROM {table_name} t3
                    WHERE t3.transaction_id = s.transaction_id
                    AND t3.account_number = s.account_number
                    AND t3.balance_after = s.balance_after
                    AND t3.is_current = 0
                )
        ) AS matched_records
        ON 1=0  -- This ensures all records go to the INSERT clause -- always false
        WHEN NOT MATCHED THEN
        INSERT *
    """)

    return None

#### Running CDC Full Batch Pipeline (Bronze, Silver)

In [29]:
spark = initialize_local_spark_delta_lake("Financial Data Pipeline_CDC")

2025-04-18 22:12:48,395 - INFO - ---Spark session initialized with Delta Lake support---


In [30]:
# CDC run over the update files: bronze and silver only (every gold_*
# argument is None). No bronze transform is passed because the CDC bronze
# writer adds the is_current / start_date / end_date columns itself inside
# its MERGE statements; both layers use the merge-based CDC writers.
run_batch_de_pipeline(spark, 'csv', cdc_abs_file_list, get_schema(), 
                     bronze_table=bronze_table, silver_table=silver_table, gold_table=None, 
                     bronze_transform=None, silver_transform=silver_transform, gold_transform=None,
                     bronze_writer=cdc_bronze_writer, silver_writer=cdc_silver_writer, gold_writer=None,
                     bronze_validation_rules=bronze_validation_rules, 
                     silver_validation_rules=silver_validation_rules, 
                     gold_validation_rules=None,
                     pipeline_name='Financial_DE_Pipeline_CDC')

2025-04-18 22:12:48,948 - INFO - --Starting data pipeline execution with ID: Financial_DE_Pipeline_CDC_20250418_221248--
2025-04-18 22:12:48,950 - INFO - Starting bronze layer processing
2025-04-18 22:12:49,009 - INFO - Successfully read CSV data from: 
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250410_113413_update.csv
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250414_113413.csv
  - c:\GitHub\DE_Pipelines\data\financial_transactions_20250417_113413.csv
2025-04-18 22:12:49,011 - INFO - No transformation function defined
2025-04-18 22:12:49,013 - INFO - Writing to bronze table: de_pipelines.financial_osb_bronze_type2
2025-04-18 22:13:14,314 - INFO - Successfully wrote to bronze table: de_pipelines.financial_osb_bronze_type2
2025-04-18 22:13:14,316 - INFO - Write Metrics: 
[
  {
    "numOutputRows": "1000",
    "numTargetBytesAdded": "43684",
    "numTargetRowsInserted": "0",
    "numTargetFilesAdded": "1",
    "materializeSourceTimeMs": "898",
    "numTargetR

{'status': 'success',
 'pipeline_id': 'Financial_DE_Pipeline_CDC_20250418_221248',
 'bronze_version': 99,
 'silver_version': 66,
 'timestamp': '2025-04-18T22:14:19.866135',
 'duration_seconds': 90.91651940345764,
 'metrics': {'pipeline_id': 'Financial_DE_Pipeline_CDC_20250418_221248',
  'start_time': '2025-04-18T22:12:48.948615',
  'stages': {'bronze': {'duration_seconds': 32.09622550010681,
    'version': 99,
    'status': 'success'},
   'bronze_optimize': {'layer': 'bronze',
    'duration_seconds': 11.698723554611206,
    'status': 'success'},
   'silver': {'duration_seconds': 34.78598713874817,
    'version': 66,
    'status': 'success',
    'source_bronze_version': 99},
   'silver_optimize': {'layer': 'silver',
    'duration_seconds': 12.329587697982788,
    'status': 'success'}},
  'status': 'success',
  'end_time': '2025-04-18T22:14:19.865135',
  'total_duration_seconds': 90.91651940345764}}

#### CDC Validation

In [31]:
spark = initialize_local_spark_delta_lake("Financial Data Pipeline - Testing")

2025-04-18 22:14:21,420 - INFO - ---Spark session initialized with Delta Lake support---


In [32]:
spark.read.table(bronze_table).filter(F.col("transaction_id") == "TXN00000501").show()

+--------------+-------------------+-----------+--------------+----------------+-------+--------+-------------+--------+--------+--------+------------+--------------------+--------------------+--------------------+----------+----------+-------------------+
|transaction_id|          timestamp|customer_id|account_number|transaction_type| amount|currency|balance_after|  status|merchant|category|    location| ingestion_timestamp|         source_file|            batch_id|is_current|start_date|           end_date|
+--------------+-------------------+-----------+--------------+----------------+-------+--------+-------------+--------+--------+--------+------------+--------------------+--------------------+--------------------+----------+----------+-------------------+
|   TXN00000501|2023-01-24 22:00:49| CUST001103| ACCT-56020613|             fee| -89.25|     USD|      4999.99|disputed|   Chase|    fees|New York, NY|2025-04-18 22:13:...|file:///c:/GitHub...|Financial_DE_Pipe...|         0|2025

In [33]:
spark.read.table(silver_table).filter(F.col("transaction_id") == "TXN00000501").show()

+--------------+-------------------+-----------+--------------+----------------+------+--------+-------------+--------+--------+--------+------------+--------------------+--------------------+--------------------+----------+----------+-------------------+----------------+----------------+----------+--------------------+
|transaction_id|          timestamp|customer_id|account_number|transaction_type|amount|currency|balance_after|  status|merchant|category|    location| ingestion_timestamp|         source_file|            batch_id|is_current|start_date|           end_date|transaction_date|transaction_time|year_month|processing_timestamp|
+--------------+-------------------+-----------+--------------+----------------+------+--------+-------------+--------+--------+--------+------------+--------------------+--------------------+--------------------+----------+----------+-------------------+----------------+----------------+----------+--------------------+
|   TXN00000501|2023-01-24 22:00:4

In [None]:
#TODO: Get working with timestamp processing
# document up