In [0]:
%run ../../config/config 

In [0]:
%run ../../config/sqlconfig

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, StructField, DecimalType
from pyspark.sql.functions import col, lit,filter
import os

In [0]:
class SalesStreamingETL:
    """
    Streaming ETL that ingests customer CSV files with Auto Loader
    (``cloudFiles``) and writes them to the
    ``workspace.etl_practice.bronze_customer`` table.

    The pipeline is: read_stream -> clean_streaming_df -> clean_header_df
    -> write_stream.  Any failure in a step is printed and then re-raised so
    the notebook / job run is marked as failed instead of silently continuing
    with a ``None`` DataFrame.
    """

    def __init__(self, base_path: str, checkpoint_path: str, schema_path: str):
        """
        Args:
            base_path: Root folder of the source data; files are read from
                ``<base_path>/customer/``.
            checkpoint_path: Root folder for streaming checkpoints;
                ``/customer_table`` is appended to it.
            schema_path: Root folder for Auto Loader schema tracking;
                ``/customer_schema`` is appended to it.
        """
        # `spark` is the session global supplied by the notebook runtime.
        self.spark = spark
        self.base_path = base_path

        # Paths (derived once from the constructor arguments)
        self.schema_path = f"{schema_path}/customer_schema"
        self.checkpoint_path = f"{checkpoint_path}/customer_table"
        self.source_path = f"{base_path}/customer/"

        # Schema
        self.schema = self._define_schema()

    # -------------------------------
    # Define schema
    # -------------------------------
    def _define_schema(self) -> StructType:
        """Return the explicit schema applied to the customer CSV files."""
        return StructType([
            StructField("customer_key", StringType(), False),
            StructField("customer_name", StringType(), True),
            StructField("gender", StringType(), True),
            StructField("city", StringType(), True),
        ])

    # -------------------------------
    # Read streaming data
    # -------------------------------
    def read_stream(self):
        """
        Build the Auto Loader streaming reader for the customer CSV folder.

        Returns:
            A streaming DataFrame with the columns defined by ``self.schema``.
        """
        # NOTE: self.schema_path already ends in "/customer_schema" (see
        # __init__), so this location repeats the segment. It is left as-is so
        # that any schema-tracking state already stored there keeps being found.
        schema_location = f"{self.schema_path}/customer_schema"
        return (
            self.spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("cloudFiles.schemaLocation", schema_location)
            .option("delimiter", ',')
            .schema(self.schema)
            .load(self.source_path)
        )

    # -------------------------------
    # Data Cleaning
    # -------------------------------
    def clean_streaming_df(self, df):
        """
        Remove exact duplicate rows from the streaming DataFrame.

        Header rows are NOT handled here; see ``clean_header_df``.

        Args:
            df: Streaming DataFrame to de-duplicate.

        Returns:
            The de-duplicated DataFrame.

        Raises:
            Exception: Whatever ``dropDuplicates`` raised, after printing it.
        """
        try:
            return df.dropDuplicates()
        except Exception as e:
            print(f"Error cleaning streaming DataFrame: {e}")
            # Re-raise: returning None here would only surface later as an
            # unrelated error in the next pipeline step.
            raise

    def clean_header_df(self, df, header_col: str):
        """
        Drop header rows that were ingested as data.

        A row is treated as a header when the value in ``header_col`` equals
        the column's own name (e.g. ``customer_key == 'customer_key'``).

        Args:
            df: DataFrame to filter.
            header_col: Name of the column used to detect header rows.

        Returns:
            The DataFrame without header rows.

        Raises:
            Exception: Whatever building the filter raised, after printing it.
        """
        try:
            # NOTE: rows where header_col is NULL are dropped as well, because
            # the `!=` comparison evaluates to NULL for them.
            without_header = df.filter(df[header_col] != header_col)
            return without_header
        except Exception as e:
            print(f"Error cleaning streaming DataFrame: {e}")
            raise

    # -------------------------------
    # Write to Delta Table (Streaming)
    # -------------------------------
    def write_stream(self, df):
        """
        Write the stream to ``workspace.etl_practice.bronze_customer`` and
        block until the query terminates.

        Uses the ``availableNow`` trigger and checkpoints under
        ``<self.checkpoint_path>/bronze_customer``.

        Args:
            df: Streaming DataFrame to persist.

        Raises:
            Exception: Whatever starting or awaiting the query raised, after
                printing it, so a failed load is not reported as a success.
        """
        try:
            query = (
                df.writeStream
                .option("checkpointLocation", f"{self.checkpoint_path}/bronze_customer")
                .trigger(availableNow=True)
                .toTable("workspace.etl_practice.bronze_customer")
            )
            query.awaitTermination()
        except Exception as e:
            print(f"Streaming job failed: {e}")
            raise

    # -------------------------------
    # Run ETL
    # -------------------------------
    def run(self):
        """Run the full pipeline: read, de-duplicate, drop headers, write."""
        raw_df = self.read_stream()
        cleaned_df = self.clean_streaming_df(raw_df)
        cleaned_df = self.clean_header_df(cleaned_df, 'customer_key')
        self.write_stream(cleaned_df)


In [0]:
# -------------------------------
# Config / Parameters
# -------------------------------
# base_path and checkpoint_path are presumably defined by the %run config
# cells at the top of the notebook -- verify against those notebooks.
source_path = f"{base_path}"

# This cell rebinds checkpoint_path from its own previous value, so append the
# table suffix only once. Without the guard, every re-run of the cell would
# grow the path and point the stream at a brand-new checkpoint location.
if not checkpoint_path.endswith("/customer_table"):
    checkpoint_path = f"{checkpoint_path}/customer_table"

schema_path = f"{base_path}/customer_schema"


In [0]:
# Build the customer ETL from the notebook-level path variables and run it
# end to end (read -> clean -> write).
obj = SalesStreamingETL(
    base_path=source_path,
    checkpoint_path=checkpoint_path,
    schema_path=schema_path,
)
obj.run()

In [0]:
%sql
-- Inspect the ingested customers ordered by key. The table name is fully
-- qualified so the query does not depend on the session's current catalog/schema.
select * from workspace.etl_practice.bronze_customer order by customer_key

In [0]:
# Load the bronze customer table as a batch DataFrame for inspection.
bronze_customer_df = spark.read.table("workspace.etl_practice.bronze_customer")

In [0]:
# Render the bronze customer table in the notebook output.
display(bronze_customer_df)