Setup and imports

In [1]:
import logging
import requests
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, lit, current_timestamp
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import to_date
import shutil
import os


Spark initialization

In [None]:
# Create (or reuse) the Spark session for this notebook. The SQL Server JDBC
# driver jar is handed to Spark through the "spark.jars" setting.
spark = (
    SparkSession.builder
    .appName("AdvancedETL")
    .config("spark.jars", "/app/jars/mssql-jdbc-12.10.1.jre11.jar")
    .getOrCreate()
)


Logs handling

In [None]:
# Root logger setup: INFO and above go both to a log file and to the
# notebook's stream output, using a timestamped "[LEVEL] message" layout.
_log_handlers = [
    logging.FileHandler("/home/jovyan/etl_pipeline_error_log.log"),
    logging.StreamHandler(),
]

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)


Load source data

In [None]:

# Both inputs are CSV files whose first row is a header. No schema is given
# and schema inference is not enabled, so values are loaded as read from the file.
sales_df = spark.read.csv("data/sales_data_2.csv", header=True)
sales_df.show(5)

product_df = spark.read.csv("data/product_reference_2.csv", header=True)
product_df.show(5)

Null handling

In [None]:
# Inspection only: nothing is removed from sales_df in this cell.
amount_is_null = col("SaleAmount").isNull()
order_date_is_null = col("OrderDate").isNull()

# Rows with a missing SaleAmount
sales_df.where(amount_is_null).show()

# Rows with a missing OrderDate
sales_df.where(order_date_is_null).show()

# Sample of rows where both fields are present
both_present = col("SaleAmount").isNotNull() & col("OrderDate").isNotNull()
sales_df.where(both_present).show(5)


Duplicate removal

In [None]:
# Clean the raw sales rows:
#   1. drop rows whose SaleAmount or OrderDate is null,
#   2. parse OrderDate (MM/dd/yyyy) into a new OrderDateParsed column,
#   3. keep only rows whose SaleAmount casts to double and whose date parsed,
#   4. de-duplicate on OrderID.
# De-duplication deliberately runs last: dropDuplicates keeps an arbitrary row
# per OrderID, so running it before validation could keep a malformed copy of
# an order and then lose that order entirely when the copy is filtered out.
sales_df_clean = (
    sales_df
    .dropna(subset=["SaleAmount", "OrderDate"])
    .withColumn("OrderDateParsed", to_date("OrderDate", "MM/dd/yyyy"))
    .filter(
        col("SaleAmount").cast("double").isNotNull()
        & col("OrderDateParsed").isNotNull()
    )
    .dropDuplicates(["OrderID"])
)

sales_df_clean.show(5)


Lookup: Join with product reference

In [None]:
# Left join keeps every cleaned sales row, including rows whose ProductID has no
# match in the product reference.
enriched_df = sales_df_clean.join(product_df, "ProductID", "left")

enriched_row_count = enriched_df.count()
print(f"[INFO] enriched_df row count: {enriched_row_count}")
enriched_df.show(5)


Currency conversion via API

In [None]:
def get_exchange_rates(timeout=10):
    """Fetch the latest USD-based exchange rates over HTTP.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        dict: The mapping found under the response's "rates" key. If the
        request fails, the server answers with an error status, or the payload
        has no usable "rates" mapping, the problem is logged and the
        placeholder mapping {"EUR": 1.0, "GBP": 1.0} is returned instead.
    """
    url = "https://api.exchangerate-api.com/v4/latest/USD"
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error body.
        response.raise_for_status()
        rates = response.json().get("rates")
        # An empty or malformed payload would silently disable conversion, so
        # route it through the same logged fallback as a failed request.
        if not isinstance(rates, dict) or not rates:
            raise ValueError("response contained no usable 'rates' mapping")
        return rates
    except Exception as e:
        # Best effort by design: the pipeline continues with placeholder rates.
        logging.error(f"Exchange rate API failed: {e}")
        return {"EUR": 1.0, "GBP": 1.0}

# Fetch the rate table once on the driver, then broadcast it so the conversion
# UDF can read it through broadcast_rates.value.
exchange_rates = get_exchange_rates()
spark_context = spark.sparkContext
broadcast_rates = spark_context.broadcast(exchange_rates)


@udf(DoubleType())
def convert_to_usd(amount, currency):
    """Convert one sale amount to USD by dividing it by its currency's rate.

    Args:
        amount: The sale amount; anything float() accepts.
        currency: Key looked up in the broadcast rate table. A key that is not
            in the table falls back to a rate of 1.0.

    Returns:
        The converted amount as a float, or None when any step raises (the
        failure is logged).
    """
    try:
        rate_table = broadcast_rates.value
        rate = rate_table.get(currency, 1.0)
        numerator = float(amount)
        denominator = float(rate)
        usd_amount = numerator / denominator
    except Exception as err:
        logging.error(f"Conversion error: amount={amount}, currency={currency}, error={err}")
        usd_amount = None
    return usd_amount

# Add the USD amount as a new column next to the original SaleAmount.
sale_amount_usd_col = convert_to_usd(col("SaleAmount"), col("Currency"))
converted_df = enriched_df.withColumn("SaleAmountUSD", sale_amount_usd_col)

converted_row_count = converted_df.count()
print(f"[INFO] converted_df row count: {converted_row_count}")
converted_df.show(5)


Logging conversion info

In [None]:
# Per-order conversion record, stamped with the current timestamp.
log_columns = ["OrderID", "Currency", "SaleAmount", "SaleAmountUSD", "ConversionTime"]
conversion_log_df = (
    converted_df
    .withColumn("ConversionTime", current_timestamp())
    .select(*log_columns)
)

log_path = "/app/logs/conversion_log"

# Remove a previous log directory if one exists. A failed delete is reported
# but does not stop the cell.
if os.path.exists(log_path):
    try:
        shutil.rmtree(log_path)
    except Exception as e:
        print(f"[WARN] Failed to delete log directory: {e}")
    else:
        print(f"Deleted old log directory at {log_path}")

# Collapse to one partition, then write the log as CSV with a header row.
single_partition_log_df = conversion_log_df.coalesce(1)
single_partition_log_df.write.csv(log_path, mode="overwrite", header=True)


Error handling with threshold

In [None]:
# Rows whose conversion produced no USD amount are treated as rejected records
# and tagged with a reason and a rejection timestamp.
error_df = (
    converted_df
    .filter(col("SaleAmountUSD").isNull())
    .withColumn("ErrorReason", lit("Invalid currency or amount"))
    .withColumn("RejectedAt", current_timestamp())
)
# Spark writes a directory of part files at this path, despite the .csv suffix.
error_df.write.mode("overwrite").option("header", True).csv("rejected/rejected_records.csv")
error_df.show(5)

# Abort the run when more than this fraction of rows is rejected.
MAX_ERROR_RATE = 0.05

rejected_count = error_df.count()
total_count = converted_df.count()

if total_count == 0:
    # Without this guard an empty input raises ZeroDivisionError below.
    logging.warning("converted_df is empty; rejected-record threshold check skipped")
    error_rate = 0.0
else:
    error_rate = rejected_count / total_count

if error_rate > MAX_ERROR_RATE:
    raise RuntimeError(
        f"[ERROR] Rejected records exceed {MAX_ERROR_RATE:.0%} threshold ({error_rate*100:.2f}%)"
    )

Final clean data

In [None]:
# Keep only the rows for which a USD amount was produced.
has_usd_amount = col("SaleAmountUSD").isNotNull()
final_df = converted_df.where(has_usd_amount)

Write to SQL Database

In [None]:
# Connection string for the SalesDB SQL Server database.
# NOTE(review): trustServerCertificate=true makes the driver skip certificate
# validation — confirm this is only used for local development.
jdbc_url = "jdbc:sqlserver://host.docker.internal:1433;databaseName=SalesDB;encrypt=true;trustServerCertificate=true"

# Credentials are read from the environment so that no secret is stored in the
# notebook. SALES_DB_USER is optional (defaults to "sa"); SALES_DB_PASSWORD is
# required.
db_password = os.environ.get("SALES_DB_PASSWORD")
if not db_password:
    raise RuntimeError(
        "SALES_DB_PASSWORD is not set; export it before running the database write cells."
    )

db_props = {
    "user": os.environ.get("SALES_DB_USER", "sa"),
    "password": db_password,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

# mode="append" adds to the existing table contents rather than replacing them.
final_df.write.jdbc(url=jdbc_url, table="SalesEnriched", mode="append", properties=db_props)


Write rejected records to SQL for tracking

In [None]:
# Append the rejected rows to the RejectedRecords table, reusing the same JDBC
# URL and connection properties as the SalesEnriched write.
rejected_writer = error_df.write
rejected_writer.jdbc(jdbc_url, "RejectedRecords", mode="append", properties=db_props)