In [1]:

import os
import sys
# Set JAVA env variable
# NOTE: JAVA_HOME and HADOOP_HOME are machine-specific Windows install
# locations; adjust them when running this notebook on another machine.
os.environ["JAVA_HOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.26.4-hotspot"
# Set Hadoop environment variables
os.environ['HADOOP_HOME'] = r'C:\hadoop'
# Put Hadoop's bin directory at the front of PATH, but only when it is not
# already listed: an unconditional prepend would add a duplicate entry every
# time this cell is re-run in the same kernel.
hadoop_bin = os.path.join(os.environ['HADOOP_HOME'], 'bin')
current_path = os.environ.get('PATH', '')  # tolerate an unset PATH
if hadoop_bin not in current_path.split(os.pathsep):
    os.environ['PATH'] = hadoop_bin + os.pathsep + current_path if current_path else hadoop_bin
# Set the Python executable path explicitly: both variables point at the
# interpreter that is running this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
import numpy as np
import pandas as pd


from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (StructType, StructField, StringType, 
                            DoubleType, IntegerType, TimestampType, 
                            DateType)
from delta.tables import DeltaTable
import time
import logging
import os
from datetime import datetime

In [None]:
# Configure root logging: INFO and above, rendered as "time - level - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after the current module, for messages emitted by this notebook
logger = logging.getLogger(__name__)

In [4]:
from de_tools import *

In [5]:
# Create the Spark handle reused by the step-by-step cells further down.
# initialize_spark is not defined in this notebook (presumably provided by the
# de_tools star import); the recorded output shows it logging
# "Spark session initialized with Delta Lake support".
spark = initialize_spark()

2025-04-08 12:57:52,219 - INFO - Spark session initialized with Delta Lake support


In [6]:
# Root of the data tree, relative to the notebook's working directory
data_dir = "../../data"

# Input CSV plus one directory per medallion layer (still relative here)
source_path = f"{data_dir}/financial_transactions.csv"
bronze_dir = f"{data_dir}/bronze/financial_osb"
silver_dir = f"{data_dir}/silver/financial_osb"
gold_dir = f"{data_dir}/gold/financial_osb"

# Make sure every layer directory exists before anything writes to it
for dir_path in (bronze_dir, silver_dir, gold_dir):
    os.makedirs(dir_path, exist_ok=True)

# Resolve everything to absolute, OS-normalised paths; os.path.abspath
# anchors a relative path at the current working directory.
source_path, bronze_path, silver_path, gold_path = (
    os.path.abspath(relative_path)
    for relative_path in (source_path, bronze_dir, silver_dir, gold_dir)
)

# The pipeline itself is launched in a later cell via run_batch_de_pipeline(...).

In [7]:
# Echo the resolved absolute bronze path as a quick check of the path setup.
bronze_path

'c:\\GitHub\\DE_Pipelines\\data\\bronze\\financial_osb'

In [8]:
def get_schema():
    """Build the explicit Spark schema for the financial transactions data.

    Every column is declared nullable. ``amount`` and ``balance_after`` are
    doubles; all other columns, including ``timestamp``, are strings.

    Returns:
        StructType: twelve fields, in the order listed below.
    """
    column_types = (
        ("transaction_id", StringType),
        # need to process as StringType and convert to Timestamp
        ("timestamp", StringType),
        ("customer_id", StringType),
        ("account_number", StringType),
        ("transaction_type", StringType),
        ("amount", DoubleType),
        ("currency", StringType),
        ("balance_after", DoubleType),
        ("status", StringType),
        ("merchant", StringType),
        ("category", StringType),
        ("location", StringType),
    )
    # Instantiate one type object per field; the third argument marks it nullable.
    return StructType([
        StructField(column_name, column_type(), True)
        for column_name, column_type in column_types
    ])

In [9]:
# Run the whole batch pipeline in one call. The call stays the cell's last
# expression so the notebook displays whatever it returns.
run_batch_de_pipeline(
    source_path,
    bronze_path,
    get_schema(),
    silver_path,
    gold_path,
    pipeline_name='Financial_Data_Pipeline',
)

2025-04-08 12:58:03,499 - INFO - Spark session initialized with Delta Lake support
2025-04-08 12:58:05,760 - INFO - Starting data pipeline execution with ID: Financial_Data_Pipeline_20250408_125805
2025-04-08 12:58:05,762 - INFO - Starting bronze layer processing for c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-08 12:58:08,768 - INFO - Successfully read CSV data from c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-08 12:58:39,393 - INFO - Successfully wrote data to bronze layer at c:\GitHub\DE_Pipelines\data\bronze\financial_osb
2025-04-08 12:58:40,728 - INFO - Running data quality checks for bronze layer
2025-04-08 12:58:42,191 - INFO - Data Quality Metrics for bronze layer:
2025-04-08 12:58:42,193 - INFO -   - Column Count: 15
2025-04-08 12:58:42,196 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 

{'status': 'success',
 'pipeline_id': 'Financial_Data_Pipeline_20250408_125805',
 'bronze_version': 12,
 'silver_version': 2,
 'timestamp': '2025-04-08T13:00:53.737987',
 'duration_seconds': 170.23650288581848,
 'metrics': {'pipeline_id': 'Financial_Data_Pipeline_20250408_125805',
  'start_time': '2025-04-08T12:58:05.760857',
  'stages': {'bronze': {'duration_seconds': 38.96899747848511,
    'version': 12,
    'status': 'success'},
   'bronze_optimize': {'layer': 'bronze',
    'duration_seconds': 5.960385322570801,
    'status': 'success'},
   'silver': {'duration_seconds': 27.304896116256714,
    'version': 2,
    'status': 'success',
    'source_bronze_version': 12},
   'silver_optimize': {'layer': 'bronze',
    'duration_seconds': 14.798668146133423,
    'status': 'success'},
   'gold': {'duration_seconds': 75.17902708053589,
    'status': 'success',
    'source_silver_version': 2,
    'tables': ['daily_category',
     'customer_summary',
     'transaction_type_summary']},
   'gold_

In [10]:
# Build a pipeline ID from the current local time (second resolution), then
# run the bronze step on its own so its outputs can be inspected.
pipeline_id = f"financial_pipeline_{datetime.now():%Y%m%d_%H%M%S}"
bronze_df, bronze_version = process_batch_bronze_layer(
    spark,
    source_path,
    get_schema(),
    bronze_path,
    pipeline_id,
)

2025-04-08 12:46:45,640 - INFO - Starting bronze layer processing for c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-08 12:46:49,827 - INFO - Successfully read CSV data from c:\GitHub\DE_Pipelines\data\financial_transactions.csv
2025-04-08 12:47:19,593 - INFO - Successfully wrote data to bronze layer at c:\GitHub\DE_Pipelines\data\bronze\financial_osb
2025-04-08 12:47:21,263 - INFO - Running data quality checks for bronze layer
2025-04-08 12:47:22,521 - INFO - Data Quality Metrics for bronze layer:
2025-04-08 12:47:22,523 - INFO -   - Column Count: 15
2025-04-08 12:47:22,525 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_timestamp', 'source_file', 'batch_id']
2025-04-08 12:47:22,527 - INFO -   - Sample size: 316
2025-04-08 12:47:22,538 - INFO - Validating dataframe with 3 rules
2025-04-08 12:47:23,220 - I

In [11]:
# Preview two rows of the bronze result.
# assumes bronze_df is a Spark DataFrame (the recorded output is a Spark-style table) — TODO confirm
bronze_df.show(2)

+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+----------+---------+-------------+--------------------+--------------------+--------------------+
|transaction_id|transaction_timestamp|customer_id|account_number|transaction_type| amount|currency|balance_after|   status|  merchant| category|     location| ingestion_timestamp|         source_file|            batch_id|
+--------------+---------------------+-----------+--------------+----------------+-------+--------+-------------+---------+----------+---------+-------------+--------------------+--------------------+--------------------+
|   TXN00000001|  2023-06-05 11:37:36| CUST001040| ACCT-86152351|          refund| 598.65|     USD|      2900.03| disputed|Mastercard|education|   Denver, CO|2025-04-08 12:47:...|file:///c:/GitHub...|financial_pipelin...|
|   TXN00000002|  2023-06-04 15:42:51|       NULL| ACCT-52201156|        transfer|-908.13|     USD|      5944.76

In [12]:
# Run the silver step on its own, passing along the bronze location and the
# bronze version returned by the previous step.
silver_df, silver_version = process_batch_silver_layer(
    spark,
    bronze_path,
    silver_path,
    pipeline_id,
    bronze_version,
)

2025-04-08 12:47:24,392 - INFO - Starting silver layer processing
2025-04-08 12:47:24,504 - INFO - Successfully read bronze data version 11
2025-04-08 12:47:36,935 - INFO - Successfully wrote data to silver layer at c:\GitHub\DE_Pipelines\data\silver\financial_osb
2025-04-08 12:47:37,418 - INFO - Running data quality checks for silver layer
2025-04-08 12:47:42,913 - INFO - Data Quality Metrics for silver layer:
2025-04-08 12:47:42,914 - INFO -   - Column Count: 19
2025-04-08 12:47:42,917 - INFO -   - Columns: ['transaction_id', 'transaction_timestamp', 'customer_id', 'account_number', 'transaction_type', 'amount', 'currency', 'balance_after', 'status', 'merchant', 'category', 'location', 'ingestion_timestamp', 'source_file', 'batch_id', 'transaction_date', 'transaction_time', 'year_month', 'processing_timestamp']
2025-04-08 12:47:42,919 - INFO -   - Sample size: 283
2025-04-08 12:47:42,922 - INFO - Validating dataframe with 3 rules
2025-04-08 12:47:47,006 - INFO - Validation rule 'vali

In [13]:
# Preview two rows of the silver result.
# assumes silver_df is a Spark DataFrame (the recorded output is a Spark-style table) — TODO confirm
silver_df.show(2)

+--------------+---------------------+-----------+--------------+----------------+------+--------+-------------+--------+----------+---------+--------------+--------------------+--------------------+--------------------+----------------+----------------+----------+--------------------+
|transaction_id|transaction_timestamp|customer_id|account_number|transaction_type|amount|currency|balance_after|  status|  merchant| category|      location| ingestion_timestamp|         source_file|            batch_id|transaction_date|transaction_time|year_month|processing_timestamp|
+--------------+---------------------+-----------+--------------+----------------+------+--------+-------------+--------+----------+---------+--------------+--------------------+--------------------+--------------------+----------------+----------------+----------+--------------------+
|          NULL|  2023-06-28 10:49:34| CUST001192| ACCT-99929834|      WITHDRAWAL|597.32|     USD|      2365.93|DISPUTED|      NULL|     NU

In [14]:
# Run the gold step on its own, passing along the silver location and the
# silver version returned by the previous step.
gold_dfs = process_batch_gold_layer(
    spark,
    silver_path,
    gold_path,
    pipeline_id,
    silver_version,
)

2025-04-08 12:47:49,407 - INFO - Starting gold layer processing
2025-04-08 12:47:49,447 - INFO - Successfully read silver data version 1
2025-04-08 12:47:57,844 - INFO - Successfully wrote gold table daily_category to c:\GitHub\DE_Pipelines\data\gold\financial_osb/daily_category
2025-04-08 12:47:58,254 - INFO - Running data quality checks for gold_daily_category layer
2025-04-08 12:48:02,244 - INFO - Data Quality Metrics for gold_daily_category layer:
2025-04-08 12:48:02,245 - INFO -   - Column Count: 9
2025-04-08 12:48:02,247 - INFO -   - Columns: ['transaction_date', 'category', 'transaction_count', 'total_amount', 'avg_amount', 'min_amount', 'max_amount', 'unique_customers', 'processing_timestamp']
2025-04-08 12:48:02,248 - INFO -   - Sample size: 521
2025-04-08 12:48:02,251 - INFO - Validating dataframe with 2 rules
2025-04-08 12:48:06,142 - INFO - Validation rule 'positive_transaction_counts' PASSED - 0 records (0.00%) failed
2025-04-08 12:48:06,368 - INFO - Validation rule 'valid