In [0]:
%run ../../config/config

In [0]:
%run  ../../config/sqlconfig

In [0]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType, DecimalType
from pyspark.sql.functions import col, lit, filter
import os

In [0]:
class ProductBronzeStreamingETL:
    """Stream product CSV files into the bronze product table.

    The pipeline reads CSV files with the ``cloudFiles`` streaming source,
    de-duplicates on ``product_sk``, drops the CSV header row (which arrives
    as data because no header option is set on the reader) and appends the
    result to ``<catalog>.<schema_name>.<table_name>``.

    All configuration (``spark``, ``base_path``, ``catalog``, ``schema_name``,
    ``bronze_product_tbl``, ``schema_path``, ``checkpoint_path``) is read from
    notebook-level globals at construction time.
    """

    def __init__(self):
        self.spark = spark
        # Misspelled attribute kept for backward compatibility; nothing in
        # this class reads it (the stream uses ``self.source_path``).
        self.sourch_path = base_path
        self.catalog = catalog
        self.schema_name = schema_name
        self.table_name = bronze_product_tbl

        # Paths
        self.schema_path = f"{schema_path}/_bronze_product_schema"
        self.checkpoint_path = f"{checkpoint_path}/_bronze_product_checkpoint"
        self.source_path = f"{base_path}/product"

        # Schema
        self.schema = self._define_schema()

    @property
    def _qualified_table_name(self) -> str:
        """Return the three-part ``catalog.schema.table`` target name."""
        return f"{self.catalog}.{self.schema_name}.{self.table_name}"

    # ----------------------------
    # Define schema
    # ----------------------------
    # The return annotation is a string so that it is not evaluated when the
    # class body is executed.
    def _define_schema(self) -> "StructType":
        """Return the bronze schema: every column is a nullable string."""
        return StructType([
            StructField('product_sk', StringType(), True),
            StructField('product_key', StringType(), True),
            StructField('Product_name', StringType(), True),
            StructField('category', StringType(), True),
            StructField('brand', StringType(), True),
            StructField('price', StringType(), True),
            StructField('effective_start_date', StringType(), True),
            StructField('effective_end_date', StringType(), True),
            StructField('is_current', StringType(), True),
        ])

    # -----------------------------
    # Read streaming data
    # -----------------------------
    def read_stream(self):
        """Return a streaming DataFrame over the CSV files in ``source_path``.

        The schema location is ``self.schema_path`` so that it is the same
        directory ``clean_schema`` removes when the pipeline is reset.
        """
        return (
            self.spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("cloudFiles.schemaLocation", self.schema_path)
            .option("delimiter", ",")
            .schema(self.schema)
            .load(self.source_path)
        )

    # -------------------------------
    # Data Cleaning
    # -------------------------------
    def clean_streaming_df(self, df):
        """Drop rows that repeat an already-seen ``product_sk``.

        Only ``product_sk`` is compared, so rows that share the key but differ
        in other columns are collapsed to a single row as well.

        Raises:
            Exception: any error from ``dropDuplicates`` is printed and then
                re-raised so the pipeline does not continue with ``None``.
        """
        try:
            return df.dropDuplicates(['product_sk'])
        except Exception as e:
            print(f"Error: clean_streaming_df : {e}")
            raise

    def clean_header_df(self, df, header_col: str):
        """Remove the CSV header row that was ingested as a data row.

        Args:
            df: DataFrame to filter.
            header_col: column whose value is compared with the literal
                ``'product_sk'``; rows where it matches are removed.

        NOTE(review): under SQL comparison semantics a NULL in ``header_col``
        does not satisfy ``!=``, so such rows are filtered out too - confirm
        this is intended.

        Raises:
            Exception: any error from building the filter is printed and then
                re-raised.
        """
        try:
            return df.filter(col(header_col) != 'product_sk')
        except Exception as e:
            print(f"Error: clean_header_df : {e}")
            raise

    # -----------------------
    # Write to delta table (Streaming)
    # -----------------------
    def write_stream(self, df):
        """Write ``df`` to the target table and block until the query ends.

        Uses the ``availableNow`` trigger together with ``checkpoint_path``.

        Raises:
            Exception: any failure while starting or running the query is
                printed and then re-raised, so callers never report success
                for a write that did not complete.
        """
        try:
            query = (
                df.writeStream
                .option("checkpointLocation", f"{self.checkpoint_path}")
                .trigger(availableNow=True)
                .toTable(self._qualified_table_name)
            )
            query.awaitTermination()
        except Exception as e:
            print(f"Error: write_stream : {e}")
            raise

    def testing_df(self):
        """Debug helper: batch-read one fixed sample CSV and return it."""
        df = self.spark.read.format("csv").load("/Volumes/workspace/etl_practice/my_file/product/dim_product.csv")
        return df

    def console_test(self, df):
        """Debug helper: stream ``df`` to the console sink and wait for it."""
        df.writeStream \
        .format("console") \
        .option("checkpointLocation", f"{self.checkpoint_path}/_debug") \
        .trigger(availableNow=True) \
        .start() \
        .awaitTermination()

    def clean_schema(self):
        """Reset the pipeline: remove checkpoint and schema dirs, drop table.

        ``IF EXISTS`` keeps the reset usable when the table was never created
        (for example after a failed first run).
        """
        dbutils.fs.rm(self.checkpoint_path, recurse=True)
        dbutils.fs.rm(self.schema_path, recurse=True)
        self.spark.sql(f"DROP TABLE IF EXISTS {self._qualified_table_name}")

    def display_tbl(self):
        """Display all rows of the target table."""
        self.spark.sql(f"SELECT *FROM {self._qualified_table_name}").display()

    def run(self):
        """Run read -> de-duplicate -> drop header -> write, in that order.

        Returns ``None``. The success message is printed only when every step
        completed, because each step re-raises on failure.
        """
        raw_df = self.read_stream()
        cleaned_df = self.clean_streaming_df(raw_df)
        cleaned_df = self.clean_header_df(cleaned_df, 'product_sk')
        self.write_stream(cleaned_df)
        print(f"Successful write {self.table_name}")

In [0]:
# Build the ETL object and run the full read -> clean -> write pipeline.
obj = ProductBronzeStreamingETL()
# NOTE(review): run() has no return statement, so raw_df is bound to None.
raw_df = obj.run()

In [0]:
# Show every row of the bronze product table the pipeline writes to.
obj.display_tbl()

In [0]:
#obj.clean_schema()