##### Environment Setup

In [1]:

import os
import sys
# Point the Spark/Hadoop toolchain at the local Windows installation.
# This cell is safe to re-run: PATH and sys.path are only extended when the
# entry is not already present, so repeated execution does not pile up
# duplicate entries.

# Set JAVA env variable
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.26.4-hotspot"

# Set Hadoop environment variables
os.environ['HADOOP_HOME'] = r'C:\hadoop'
hadoop_bin = os.environ['HADOOP_HOME'] + r'\bin'
current_path = os.environ.get('PATH', '')
if hadoop_bin not in current_path.split(os.pathsep):
    # Prepend so the Hadoop binaries take precedence over anything else on PATH.
    os.environ['PATH'] = (hadoop_bin + os.pathsep + current_path) if current_path else hadoop_bin

# Set the Python executable path explicitly so the driver and the workers
# both use the interpreter that is running this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Get Pipeline Tools Path (makes the local pipeline helper module importable)
tools_dir = R'C:\GitHub\Tools\de'
if tools_dir not in sys.path:
    sys.path.append(tools_dir)


##### Libraries

In [2]:
import numpy as np
import pandas as pd


from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (StructType, StructField, StringType, 
                            DoubleType, IntegerType, TimestampType, 
                            DateType)
from delta.tables import DeltaTable
import time
import logging
import os
from datetime import datetime

In [3]:
from de_pipeline_tools import *

##### Spark Session

In [4]:
# Create the Spark session through a project helper (presumably provided by the
# `de_pipeline_tools` star import above — verify). The log line printed below
# reports that the session was initialized with Delta Lake support.
spark = initialize_spark_delta_lake("Financial Data Pipeline")

2025-04-08 14:29:10,266 - INFO - ---Spark session initialized with Delta Lake support---


##### Inputs/Outputs

In [5]:
# Root data folder, expressed relative to the notebook's working directory.
data_dir = "../../data"

# Relative locations of the raw CSV and of each bronze/silver/gold layer folder.
source_path = f"{data_dir}/financial_transactions.csv"
bronze_dir, silver_dir, gold_dir = (
    f"{data_dir}/{layer}/financial_osb" for layer in ("bronze", "silver", "gold")
)

# Make sure every layer folder exists before anything is written to it.
for dir_path in (bronze_dir, silver_dir, gold_dir):
    os.makedirs(dir_path, exist_ok=True)


def _to_absolute(relative_path):
    """Resolve a '/'-separated relative path against the current working directory."""
    segments = relative_path.split('/')
    return os.path.abspath(os.path.join(os.getcwd(), *segments))


# Absolute, OS-native versions of the paths above; these are what get passed
# to the pipeline functions.
source_path = _to_absolute(source_path)
bronze_path, silver_path, gold_path = map(
    _to_absolute, (bronze_dir, silver_dir, gold_dir)
)


In [6]:
def get_schema():
    """Build the explicit Spark schema used to read the transactions CSV.

    Returns:
        StructType: twelve nullable fields, in source-column order. Every
        column is a string except ``amount`` and ``balance_after``, which
        are doubles.
    """
    # (column name, Spark type class) pairs, in source-column order.
    column_types = [
        ("transaction_id", StringType),
        # need to process as StringType and convert to Timestamp
        ("timestamp", StringType),
        ("customer_id", StringType),
        ("account_number", StringType),
        ("transaction_type", StringType),
        ("amount", DoubleType),
        ("currency", StringType),
        ("balance_after", DoubleType),
        ("status", StringType),
        ("merchant", StringType),
        ("category", StringType),
        ("location", StringType),
    ]
    # Each field gets its own type instance and is declared nullable.
    fields = [StructField(name, dtype(), True) for name, dtype in column_types]
    return StructType(fields)

### Running Full Batch Pipeline

In [7]:
# Run the batch pipeline end to end on the source CSV, using the explicit schema
# from get_schema(). As the last expression in the cell, the call's return value
# is displayed below: a run-summary dict with the overall status, the bronze and
# silver table versions, and per-stage durations.
run_batch_de_pipeline(spark, source_path, bronze_path, get_schema(), silver_path, gold_path, pipeline_name='Financial_Data_Pipeline')

2025-04-08 14:29:20,924 - INFO - --Starting data pipeline execution with ID: Financial_Data_Pipeline_20250408_142920--
2025-04-08 14:29:20,925 - INFO - Starting bronze layer processing for c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-08 14:29:24,761 - INFO - Successfully read CSV data from c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-08 14:29:51,663 - INFO - Successfully wrote data to bronze layer at c:\GitHub\DE_Pipelines\data\bronze\financial_osb
2025-04-08 14:29:53,360 - INFO - Running data quality checks for bronze layer
2025-04-08 14:29:54,607 - INFO - Data Quality Metrics for bronze layer:
2025-04-08 14:29:54,609 - INFO -   - Column Count: 15
2025-04-08 14:29:54,610 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_timestamp', 'source_file', 'batch_id']
2025-04-08 14:29:54,611 - INF

{'status': 'success',
 'pipeline_id': 'Financial_Data_Pipeline_20250408_142920',
 'bronze_version': 6,
 'silver_version': 11,
 'timestamp': '2025-04-08T14:32:05.554673',
 'duration_seconds': 164.62776947021484,
 'metrics': {'pipeline_id': 'Financial_Data_Pipeline_20250408_142920',
  'start_time': '2025-04-08T14:29:20.924009',
  'stages': {'bronze': {'duration_seconds': 35.17924213409424,
    'version': 6,
    'status': 'success'},
   'bronze_optimize': {'layer': 'bronze',
    'duration_seconds': 3.6484827995300293,
    'status': 'success'},
   'silver': {'duration_seconds': 24.164745569229126,
    'version': 11,
    'status': 'success',
    'source_bronze_version': 6},
   'silver_optimize': {'layer': 'silver',
    'duration_seconds': 12.790331602096558,
    'status': 'success'},
   'gold': {'duration_seconds': 82.59152293205261,
    'status': 'success',
    'source_silver_version': 11,
    'tables': ['daily_category',
     'customer_summary',
     'transaction_type_summary']},
   'gold

### Running Silver in Test Mode
##### Simulating: Investigating Validation Check Failures

In [8]:
# Rebind `spark` under a separate application name for the silver-layer test run.
# NOTE(review): whether the helper builds a new session or returns the one that is
# already running cannot be determined from this notebook — confirm in the helper.
spark = initialize_spark_delta_lake("Financial Data Pipeline - Testing")

2025-04-08 14:32:06,821 - INFO - ---Spark session initialized with Delta Lake support---


In [9]:
# Run only the silver stage against the existing bronze table. The call returns
# two values, unpacked here into the silver DataFrame and a version value.
# NOTE(review): the fourth argument 'test' presumably selects a test/sampling mode
# (the log below reports a sample of 278 rows) — confirm against the signature of
# process_batch_silver_layer.
silverdf, silver_version = process_batch_silver_layer(
            spark, bronze_path, silver_path, 'test'
        )

2025-04-08 14:32:08,873 - INFO - Starting silver layer processing
2025-04-08 14:32:09,485 - INFO - Successfully read bronze data version 6
2025-04-08 14:32:09,596 - INFO - Running data quality checks for silver layer
2025-04-08 14:32:16,188 - INFO - Data Quality Metrics for silver layer:
2025-04-08 14:32:16,190 - INFO -   - Column Count: 19
2025-04-08 14:32:16,191 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_timestamp', 'source_file', 'batch_id', 'transaction_date', 'transaction_time', 'year_month', 'processing_timestamp']
2025-04-08 14:32:16,195 - INFO -   - Sample size: 278
2025-04-08 14:32:16,197 - INFO - Validating dataframe with 3 rules
2025-04-08 14:32:19,256 - INFO - Validation rule 'valid_transaction_type' FAILED - 113 records (40.65%) failed
2025-04-08 14:32:19,436 - INFO - Validation rule 'valid_status' FAILE

In [10]:
# Preview the silver DataFrame returned by the test run
# (DataFrame.show() prints at most the first 20 rows by default).
silverdf.show()

+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+--------------------+-------------+-----------------+--------------------+--------------------+--------------------+----------------+----------------+----------+--------------------+
|transaction_id|transaction_timestamp|customer_id|account_number|transaction_type| amount|currency|balance_after|   status|            merchant|     category|         location| ingestion_timestamp|         source_file|            batch_id|transaction_date|transaction_time|year_month|processing_timestamp|
+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+--------------------+-------------+-----------------+--------------------+--------------------+--------------------+----------------+----------------+----------+--------------------+
|   TXN00000001|  2023-06-05 11:37:36| CUST001040| ACCT-86152351|          refund|

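The "status" column is corrupt: alongside the expected values it contains NULLs and many one-off misspellings (e.g. "pendlng", "dioputed"), as the value counts below show.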

In [11]:
# Count the rows for each distinct `status` value to expose malformed entries.
# NOTE(review): show() prints only the first 20 groups, in no guaranteed order
# (the output ends with "only showing top 20 rows"); consider ordering by count
# and raising the row limit to inspect the full distribution.
silverdf.groupBy('status').count().show()

+---------+-----+
|   status|count|
+---------+-----+
|  pendlng|    1|
|completed|  264|
| dioputed|    1|
| dirputed|    1|
|completpd|    1|
|   failed|  285|
|     NULL|   13|
|completei|    1|
|  pjnding|    1|
| oeversed|    1|
|  sending|    1|
| disiuted|    1|
|  pendinz|    1|
| repersed|    1|
| disptted|    1|
|  pendiny|    1|
|complethd|    1|
|  pending|  269|
|   faihed|    1|
|  pedding|    1|
+---------+-----+
only showing top 20 rows

