##### Environment Setup

In [1]:

import os
import sys

# NOTE: every location below is a machine-specific Windows install path;
# adjust them when running this notebook on a different machine.

# Set JAVA env variable
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.26.4-hotspot"

# Set Hadoop environment variables
os.environ['HADOOP_HOME'] = r'C:\hadoop'

# Put Hadoop's bin directory first on PATH. Only prepend when it is not already
# the first entry, so re-running this cell does not keep growing PATH.
hadoop_bin = os.environ['HADOOP_HOME'] + r'\bin'
if os.environ['PATH'].split(';')[0] != hadoop_bin:
    os.environ['PATH'] = hadoop_bin + ';' + os.environ['PATH']

# Set the Python executable path explicitly (same interpreter as this kernel)
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Get Pipeline Tools Path: make the tools folder importable, adding it only
# once so re-running this cell does not pile up duplicate sys.path entries.
pipeline_tools_dir = R'C:\GitHub\Tools\de'
if pipeline_tools_dir not in sys.path:
    sys.path.append(pipeline_tools_dir)


##### Libraries

In [2]:
import time
import logging
from datetime import datetime

import numpy as np
import pandas as pd

from typing import Dict


from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import (StructType, StructField, StringType, 
                            DoubleType, IntegerType, TimestampType, 
                            DateType)
from delta.tables import DeltaTable


In [3]:
from de_pipeline_tools import *

##### Spark Session

In [4]:
# Create the Spark session used by the full pipeline run below.
# NOTE(review): the helper presumably comes from the `de_pipeline_tools` star import — confirm.
spark = initialize_local_spark_delta_lake("Financial Data Pipeline")

2025-04-09 09:34:30,844 - INFO - ---Spark session initialized with Delta Lake support---


##### Inputs/Outputs

In [5]:
# Data root, relative to the notebook's working directory.
data_dir = "../../data"

# Relative locations of the raw CSV and of each medallion layer.
source_path = f"{data_dir}/financial_transactions.csv"
bronze_dir = f"{data_dir}/bronze/financial_osb"
silver_dir = f"{data_dir}/silver/financial_osb"
gold_dir = f"{data_dir}/gold/financial_osb"

# Make sure every layer directory exists before anything is written to it.
for dir_path in (bronze_dir, silver_dir, gold_dir):
    os.makedirs(dir_path, exist_ok=True)

# Resolve all four locations to absolute, OS-native paths anchored at the
# current working directory (source_path is rebound to its absolute form).
source_path, bronze_path, silver_path, gold_path = (
    os.path.abspath(os.path.join(os.getcwd(), *relative_path.split('/')))
    for relative_path in (source_path, bronze_dir, silver_dir, gold_dir)
)


In [6]:
def get_schema():
    """Return the explicit schema used to read the raw transactions CSV.

    Every field is nullable. ``timestamp`` is declared as a string on purpose:
    it is read as text and converted to a real timestamp later.
    """
    column_types = [
        ("transaction_id", StringType()),
        ("timestamp", StringType()),  # need to process as StringType and convert to Timestamp
        ("customer_id", StringType()),
        ("account_number", StringType()),
        ("transaction_type", StringType()),
        ("amount", DoubleType()),
        ("currency", StringType()),
        ("balance_after", DoubleType()),
        ("status", StringType()),
        ("merchant", StringType()),
        ("category", StringType()),
        ("location", StringType()),
    ]
    return StructType(
        [StructField(column_name, column_type, True) for column_name, column_type in column_types]
    )

### Define Transformation Functions and Validation Rules

In [7]:
def bronze_transform(df:DataFrame) -> DataFrame:
    """Parse the raw ``timestamp`` string column into ``transaction_timestamp``.

    Args:
        df: DataFrame with a string column named ``timestamp``.

    Returns:
        The DataFrame with ``timestamp`` replaced, in the same column position,
        by ``transaction_timestamp``; values that cannot be cast become NULL.
    """
    # Keep only digits, '-', ':' and spaces before attempting the cast.
    # NOTE(review): this also strips letters and dots (e.g. an ISO-8601 'T'
    # separator or fractional seconds) — confirm the source format never uses them.
    stripped_timestamp = F.regexp_replace(F.col("timestamp"), "[^0-9\\-: ]", "")

    # try_cast handles invalid timestamps gracefully by returning NULL.
    parsed_timestamp = F.expr("try_cast(timestamp_cleaned as timestamp)")

    return (
        df
        .withColumn("timestamp_cleaned", stripped_timestamp)
        .withColumn("timestamp", parsed_timestamp)
        # Rename to transaction_timestamp
        .withColumnRenamed('timestamp', 'transaction_timestamp')
        # The cleaned string was only an intermediate value.
        .drop("timestamp_cleaned")
    )

In [8]:
# Define bronze validation rules
# Bronze validation rules. Each rule is a dict with a "name", a SQL "condition"
# string and a human-readable "description".
bronze_validation_rules = [
    {
        "name": "has_transaction_id",
        "condition": "transaction_id IS NOT NULL",
        "description": "Transaction ID must be present"
    },
    {
        "name": "valid_amount",
        "condition": "amount IS NULL OR amount > 0",
        "description": "Amount must be positive if not null"
    },
    {
        "name": "valid_transaction_timestamp",
        # Compare against current_timestamp(), not current_date(): a date is
        # promoted to midnight of today, which would flag every transaction made
        # earlier today as being "in the future".
        "condition": "transaction_timestamp IS NULL OR transaction_timestamp <= current_timestamp()",
        "description": "Transaction Timestamp must not be in the future"
    }
]

In [9]:
def silver_transform(df:DataFrame) -> DataFrame:
    """Deduplicate, standardize, filter and enrich bronze rows for the silver layer.

    Args:
        df: Bronze DataFrame containing at least ``transaction_id``, ``amount``,
            ``transaction_type``, ``category``, ``status`` and
            ``transaction_timestamp``.

    Returns:
        A DataFrame with one row per ``transaction_id``, absolute amounts,
        lower-cased ``transaction_type``/``category``/``status``, invalid rows
        removed, and the added columns ``transaction_date``,
        ``transaction_time``, ``year_month`` and ``processing_timestamp``.
    """
    # Remove duplicates
    df = df.dropDuplicates(subset=["transaction_id"])

    # Standardize Data
    # Amounts are made absolute here, so the "amount > 0" filter below only
    # removes zero and NULL amounts.
    df = (df
            .withColumn("amount", F.abs(F.col("amount")))
            .withColumn("transaction_type", F.lower(F.col("transaction_type")))
            .withColumn("category", F.lower(F.col("category")))
            .withColumn("status", F.lower(F.col("status")))
    )

    # Filter Data
    # Address bronze layer data validation check concerns
    # The timestamp is compared with current_timestamp() rather than
    # current_date(): a date is promoted to midnight of today, which would
    # silently drop every transaction made earlier today.
    not_in_future = (
        F.col('transaction_timestamp').isNull()  # unparsed timestamps are kept
        | (F.col('transaction_timestamp') <= F.current_timestamp())  # no future timestamps
    )
    df = df.filter(
        F.col('transaction_id').isNotNull()  # transaction id must exist
        & (F.col('amount') > 0)  # amount must be positive
        & not_in_future
    )

    # Split timestamp into date and time
    df = (df
            .withColumn("transaction_date", F.to_date("transaction_timestamp"))
            .withColumn("transaction_time", F.date_format("transaction_timestamp", "HH:mm:ss"))
    )

    # Derive year_month for partitioning
    df = (df
            .withColumn("year_month", F.date_format(F.col("transaction_date"), "yyyy-MM"))
            .withColumn("processing_timestamp", F.current_timestamp())
    )

    return df

In [10]:
# Define silver validation rules
# Silver validation rules, written as (name, SQL condition, description)
# triples and expanded into the dict shape the other rule lists use.
# Every rule accepts NULL values.
silver_validation_rules = [
    {"name": rule_name, "condition": rule_condition, "description": rule_description}
    for rule_name, rule_condition, rule_description in (
        (
            "valid_transaction_type",
            "transaction_type IN ('debit', 'credit', 'transfer', 'payment', 'withdrawal', 'deposit') OR transaction_type IS NULL",
            "Transaction type must be one of the valid types",
        ),
        (
            "valid_status",
            "status IN ('completed', 'pending', 'failed', 'cancelled', 'refunded') OR status IS NULL",
            "Status must be one of the valid statuses",
        ),
        (
            "valid_currency",
            "currency IS NULL OR length(currency) = 3",
            "Currency code should be 3 characters if present",
        ),
    )
]

In [11]:
def gold_transform(df:DataFrame) -> Dict:
    """Build the gold-layer aggregate tables from a silver DataFrame.

    Args:
        df: DataFrame with ``transaction_id``, ``transaction_date``,
            ``category``, ``customer_id``, ``transaction_type`` and ``amount``.

    Returns:
        A dict mapping table name to aggregated DataFrame, in this order:
        ``daily_category``, ``customer_summary``, ``transaction_type_summary``.
        Every table carries a ``processing_timestamp`` column.
    """
    def _with_processing_time(frame):
        # Stamp an aggregate with the time it was computed.
        return frame.withColumn("processing_timestamp", F.current_timestamp())

    # Gold aggregation 1: daily summary by category
    by_day_and_category = df.groupBy("transaction_date", "category").agg(
        F.count("transaction_id").alias("transaction_count"),
        F.sum("amount").alias("total_amount"),
        F.avg("amount").alias("avg_amount"),
        F.min("amount").alias("min_amount"),
        F.max("amount").alias("max_amount"),
        F.countDistinct("customer_id").alias("unique_customers"),
    )

    # Gold aggregation 2: customer summary
    per_customer = df.groupBy("customer_id").agg(
        F.count("transaction_id").alias("transaction_count"),
        F.sum("amount").alias("total_amount"),
        F.avg("amount").alias("avg_amount"),
        F.min("transaction_date").alias("first_transaction_date"),
        F.max("transaction_date").alias("last_transaction_date"),
        F.approx_count_distinct("category").alias("category_count"),
    )
    # Recency is derived after the timestamp so the column order stays
    # processing_timestamp, then days_since_last_transaction.
    days_since_last = F.datediff(F.current_date(), F.col("last_transaction_date"))

    # Gold aggregation 3: transaction type summary
    per_transaction_type = df.groupBy("transaction_type").agg(
        F.count("transaction_id").alias("transaction_count"),
        F.sum("amount").alias("total_amount"),
        F.avg("amount").alias("avg_amount"),
    )

    return {
        "daily_category": _with_processing_time(by_day_and_category),
        "customer_summary": _with_processing_time(per_customer).withColumn(
            "days_since_last_transaction", days_since_last
        ),
        "transaction_type_summary": _with_processing_time(per_transaction_type),
    }

In [12]:
# Define gold validation rules
# Gold validation rules: both refer to columns (transaction_count, total_amount)
# that every table produced by gold_transform defines.
gold_validation_rules = [
    dict(
        name="positive_transaction_counts",
        condition="transaction_count > 0",
        description="Transaction counts should be positive",
    ),
    dict(
        name="valid_total_amounts",
        condition="total_amount >= 0",
        description="Total amounts should not be negative",
    ),
]

### Running Full Batch Pipeline

In [13]:
# Run the whole pipeline in one call; the value it returns is left as the
# cell's last expression so the notebook displays it.
run_batch_de_pipeline(
    spark,
    source_path,
    get_schema(),
    bronze_path,
    silver_path,
    gold_path,
    bronze_transform,
    silver_transform,
    gold_transform,
    bronze_validation_rules,
    silver_validation_rules,
    gold_validation_rules,
    pipeline_name='Financial_DE_Pipeline',
)

2025-04-09 09:34:41,620 - INFO - --Starting data pipeline execution with ID: Financial_DE_Pipeline_20250409_093441--
2025-04-09 09:34:41,621 - INFO - Starting bronze layer processing for c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-09 09:34:45,322 - INFO - Successfully read CSV data from c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-09 09:34:45,564 - INFO - Transformation function applied
2025-04-09 09:35:15,614 - INFO - Successfully wrote data to bronze layer at c:\GitHub\DE_Pipelines\data\bronze\financial_osb
2025-04-09 09:35:17,528 - INFO - Running data quality checks for bronze layer
2025-04-09 09:35:18,844 - INFO - Data Quality Metrics for bronze layer:
2025-04-09 09:35:18,846 - INFO -   - Column Count: 15
2025-04-09 09:35:18,847 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_times

{'status': 'success',
 'pipeline_id': 'Financial_DE_Pipeline_20250409_093441',
 'bronze_version': 7,
 'silver_version': 13,
 'timestamp': '2025-04-09T09:37:21.294312',
 'duration_seconds': 159.6725251674652,
 'metrics': {'pipeline_id': 'Financial_DE_Pipeline_20250409_093441',
  'start_time': '2025-04-09T09:34:41.619794',
  'stages': {'bronze': {'duration_seconds': 38.567933559417725,
    'version': 7,
    'status': 'success'},
   'bronze_optimize': {'layer': 'bronze',
    'duration_seconds': 4.170166015625,
    'status': 'success'},
   'silver': {'duration_seconds': 25.652068614959717,
    'version': 13,
    'status': 'success',
    'source_bronze_version': 7},
   'silver_optimize': {'layer': 'silver',
    'duration_seconds': 15.585379362106323,
    'status': 'success'},
   'gold': {'duration_seconds': 69.2231125831604,
    'status': 'success',
    'source_silver_version': 13,
    'tables': ['daily_category',
     'customer_summary',
     'transaction_type_summary']},
   'gold_optimize

### Running Each Layer In Isolation

In [14]:
# Rebind `spark` to a session with a separate app name for the per-layer runs
# below; this replaces the session object used by the full pipeline run above.
spark = initialize_local_spark_delta_lake("Financial Data Pipeline - Testing")

2025-04-09 09:37:22,276 - INFO - ---Spark session initialized with Delta Lake support---


#### Bronze

In [15]:
# Run only the bronze layer against the source CSV.
# NOTE(review): mode='test' presumably skips persisting the result — confirm in de_pipeline_tools.
bronzedf, bronze_version = process_batch_bronze_layer(
    spark,
    source_path,
    get_schema(),
    bronze_path,
    bronze_transform=bronze_transform,
    validation_rules=bronze_validation_rules,
    pipeline_id='test',
    mode='test',
)

2025-04-09 09:37:23,724 - INFO - Starting bronze layer processing for c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-09 09:37:23,789 - INFO - Successfully read CSV data from c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-09 09:37:23,822 - INFO - Transformation function applied
2025-04-09 09:37:23,826 - INFO - Running data quality checks for bronze layer
2025-04-09 09:37:24,409 - INFO - Data Quality Metrics for bronze layer:
2025-04-09 09:37:24,413 - INFO -   - Column Count: 15
2025-04-09 09:37:24,415 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_timestamp', 'source_file', 'batch_id']
2025-04-09 09:37:24,417 - INFO -   - Sample size: 169
2025-04-09 09:37:24,421 - INFO - Validating dataframe with 3 rules
2025-04-09 09:37:24,751 - INFO - Validation rule 'has_transaction_id' FAILED - 5 record

In [16]:
# Preview the first 5 rows of the bronze DataFrame returned above.
bronzedf.show(5)

+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+----------+---------+----------------+--------------------+--------------------+--------+
|transaction_id|transaction_timestamp|customer_id|account_number|transaction_type| amount|currency|balance_after|   status|  merchant| category|        location| ingestion_timestamp|         source_file|batch_id|
+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+----------+---------+----------------+--------------------+--------------------+--------+
|   TXN00000001|  2023-06-05 11:37:36| CUST001040| ACCT-86152351|          refund| 598.65|     USD|      2900.03| disputed|Mastercard|education|      Denver, CO|2025-04-09 09:37:...|file:///c:/GitHub...|    test|
|   TXN00000002|  2023-06-04 15:42:51|       NULL| ACCT-52201156|        transfer|-908.13|     USD|      5944.76|completed|      NULL|     NULL|   S

#### Silver

In [17]:
# Run only the silver layer on top of the stored bronze data.
# NOTE(review): bronze_version=None presumably selects the latest bronze version — confirm in de_pipeline_tools.
silverdf, silver_version = process_batch_silver_layer(
    spark,
    bronze_path,
    silver_path,
    silver_transform=silver_transform,
    validation_rules=silver_validation_rules,
    pipeline_id='test',
    mode='test',
    bronze_version=None,
)

2025-04-09 09:37:25,387 - INFO - Starting silver layer processing
2025-04-09 09:37:25,879 - INFO - Successfully read bronze data version 7
2025-04-09 09:37:25,990 - INFO - Transformation function applied
2025-04-09 09:37:25,995 - INFO - Running data quality checks for silver layer
2025-04-09 09:37:33,042 - INFO - Data Quality Metrics for silver layer:
2025-04-09 09:37:33,043 - INFO -   - Column Count: 19
2025-04-09 09:37:33,045 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_timestamp', 'source_file', 'batch_id', 'transaction_date', 'transaction_time', 'year_month', 'processing_timestamp']
2025-04-09 09:37:33,048 - INFO -   - Sample size: 135
2025-04-09 09:37:33,051 - INFO - Validating dataframe with 3 rules
2025-04-09 09:37:36,580 - INFO - Validation rule 'valid_transaction_type' FAILED - 56 records (41.48%) failed
2025-

In [18]:
# Preview the first 5 rows of the silver DataFrame returned above.
silverdf.show(5)

+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+-----------+-------------+----------------+--------------------+--------------------+--------------------+----------------+----------------+----------+--------------------+
|transaction_id|transaction_timestamp|customer_id|account_number|transaction_type| amount|currency|balance_after|   status|   merchant|     category|        location| ingestion_timestamp|         source_file|            batch_id|transaction_date|transaction_time|year_month|processing_timestamp|
+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+-----------+-------------+----------------+--------------------+--------------------+--------------------+----------------+----------------+----------+--------------------+
|   TXN00000001|  2023-06-05 11:37:36| CUST001040| ACCT-86152351|          refund| 598.65|     USD|      2900.03

##### Investigating Silver Validation Check Failures:
The "status" column contains corrupted (misspelled) values, e.g. `pendlng` and `completpd`, as the counts below show.

In [19]:
# Count rows per distinct status value; show() prints only the first 20 groups.
silverdf.groupBy('status').count().show()

+---------+-----+
|   status|count|
+---------+-----+
|  pendlng|    1|
|completed|  264|
| dioputed|    1|
| dirputed|    1|
|completpd|    1|
|   failed|  285|
|     NULL|   13|
|completei|    1|
|  pjnding|    1|
| oeversed|    1|
|  sending|    1|
| disiuted|    1|
|  pendinz|    1|
| repersed|    1|
| disptted|    1|
|  pendiny|    1|
|complethd|    1|
|  pending|  269|
|   faihed|    1|
|  pedding|    1|
+---------+-----+
only showing top 20 rows



#### Gold

In [27]:
# Run only the gold layer on top of the stored silver data.
# NOTE(review): silver_version=None presumably selects the latest silver version — confirm in de_pipeline_tools.
gold_dfs = process_batch_gold_layer(
    spark,
    silver_path,
    gold_path,
    gold_transform,
    validation_rules=gold_validation_rules,
    pipeline_id='test',
    mode='test',
    silver_version=None,
)

2025-04-09 09:40:54,407 - INFO - Starting gold layer processing
2025-04-09 09:40:54,924 - INFO - Successfully read silver data version 14
2025-04-09 09:40:55,058 - INFO - Transformation function applied
2025-04-09 09:40:55,061 - INFO - Running data quality checks for gold_daily_category layer
2025-04-09 09:41:02,529 - INFO - Data Quality Metrics for gold_daily_category layer:
2025-04-09 09:41:02,531 - INFO -   - Column Count: 9
2025-04-09 09:41:02,533 - INFO -   - Columns: ['transaction_date', 'category', 'transaction_count', 'total_amount', 'avg_amount', 'min_amount', 'max_amount', 'unique_customers', 'processing_timestamp']
2025-04-09 09:41:02,537 - INFO -   - Sample size: 197
2025-04-09 09:41:02,541 - INFO - Validating dataframe with 2 rules
2025-04-09 09:41:09,266 - INFO - Validation rule 'positive_transaction_counts' PASSED - 0 records (0.00%) failed
2025-04-09 09:41:11,715 - INFO - Validation rule 'valid_total_amounts' PASSED - 0 records (0.00%) failed
2025-04-09 09:41:11,725 - I

In [22]:
# Preview the first 5 rows of every gold table, in the order the tables are stored.
for table_name in gold_dfs:
    gold_dfs[table_name].show(5)

+----------------+--------+-----------------+------------------+-----------------+----------+----------+----------------+--------------------+
|transaction_date|category|transaction_count|      total_amount|       avg_amount|min_amount|max_amount|unique_customers|processing_timestamp|
+----------------+--------+-----------------+------------------+-----------------+----------+----------+----------------+--------------------+
|      2023-05-25|    fees|                1|             18.07|            18.07|     18.07|     18.07|               1|2025-04-09 09:38:...|
|      2023-12-05|  income|                1|           1818.85|          1818.85|   1818.85|   1818.85|               1|2025-04-09 09:38:...|
|      2023-10-01|    fees|                1|             32.44|            32.44|     32.44|     32.44|               1|2025-04-09 09:38:...|
|      2023-12-27|  dining|                1|           1307.27|          1307.27|   1307.27|   1307.27|               1|2025-04-09 09:38:...|