In [0]:
%run ../../config/config 

In [0]:
%run ../../config/sqlconfig

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, StructField
from pyspark.sql.functions import col, lit

In [0]:
# -------------------------------
# Config / Parameters
# -------------------------------
# `base_path`, `checkpoint_path` and `spark` are not defined in this cell;
# they are expected to be provided by the %run config notebooks / runtime.
schema_path = f"{base_path}/dev_table_schema"
# Derive the table-specific checkpoint under its OWN name. Rebinding
# `checkpoint_path` to f"{checkpoint_path}/dev_table" made this cell
# non-idempotent: every re-run appended another "/dev_table", so the stream
# started from a brand-new checkpoint and re-ingested all source files.
dev_table_checkpoint_path = f"{checkpoint_path}/dev_table"
source_path = f"{base_path}/books-csv"

# -------------------------------
# Define schema
# -------------------------------
# NOTE: despite the historical name, this schema describes the columns of the
# semicolon-delimited CSV source read below (name kept for compatibility).
json_schema = StructType([
    StructField("book_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("author", StringType(), True),
    StructField("category", StringType(), True),
    StructField("price", IntegerType(), True),
])

# -------------------------------
# Read streaming data
# -------------------------------
# header=false: any header line is read as an ordinary data row and is
# removed afterwards by clean_streaming_df().
books_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", "false")
    .option("cloudFiles.schemaLocation", schema_path)
    .option("delimiter", ";")
    .schema(json_schema)
    .load(source_path)
)

# -------------------------------
# Data Cleaning
# -------------------------------
def clean_streaming_df(df, header_col: str):
    """
    Clean a streaming DataFrame by:
    1. Removing header rows, i.e. rows whose `header_col` value equals the
       column's own name (e.g. book_id == "book_id")
    2. Removing exact duplicate rows

    Args:
        df: DataFrame to clean.
        header_col: Name of the column used to recognise a header row.

    Returns:
        The filtered, de-duplicated DataFrame.
    """
    # Null-safe comparison: a plain `col != literal` evaluates to NULL when the
    # column is NULL, and filter() drops NULL predicates - which silently
    # discarded every row with a missing key, not just header rows.
    is_header_row = col(header_col).eqNullSafe(lit(header_col))
    # NOTE(review): dropDuplicates() without a watermark keeps state for all
    # rows seen so far; confirm that is acceptable for the expected volume.
    return df.filter(~is_header_row).dropDuplicates()

cleaned_df = clean_streaming_df(books_df, "book_id")

# -------------------------------
# Write to Delta Table (Streaming)
# -------------------------------
dev_table_query = (
    cleaned_df.writeStream
    .option("checkpointLocation", dev_table_checkpoint_path)
    .trigger(availableNow=True)
    .toTable("workspace.etl_practice.dev_table")
)

# toTable() only starts the query and returns immediately. Block until the
# availableNow run has processed the pending files so that later cells (e.g.
# the SQL preview) do not run against a missing or half-written table.
# Raises if the streaming query fails.
dev_table_query.awaitTermination()

In [0]:
#dbutils.fs.rm("/Volumes/workspace/etl_practice/my_file/my_file/_checkpoints/", recurse=True)

In [0]:
#print(base_path)

In [0]:
#dbutils.fs.rm(f"{base_path}/dev_table_schema", recurse=True)

In [0]:
%sql
-- drop table dev_table

In [0]:
%sql
-- Preview the contents of dev_table.
-- NOTE(review): the name is unqualified; presumably it resolves to
-- workspace.etl_practice.dev_table via the current catalog/schema - confirm.
SELECT *
FROM dev_table