In [0]:
from pyspark import pipelines as dp
from pyspark.sql.functions import col,current_timestamp

In [0]:
@dp.table(
    name = "bronze_addresses_py",
    table_properties = {"quality" : "bronze"},
    comment = "this is bronze table"
)
def bronze_addresses_py():
    """Incrementally ingest raw address CSV files into the bronze table.

    Reads the addresses landing volume as a stream through Auto Loader
    (the ``cloudFiles`` source) with column-type inference enabled and the
    ``addNewColumns`` schema evolution mode, then appends two audit columns.

    Returns:
        A streaming DataFrame holding the columns read from the files plus
        ``input_file_path`` (taken from ``_metadata.file_path``) and
        ``ingested_time`` (``current_timestamp()`` at processing time).
    """
    return (
        spark.readStream.format("cloudFiles")
        # Use the documented spelling of the option key (was "cloudFiles.Format").
        .option("cloudFiles.format", "csv")
        # The silver table selects columns by name (customer_id, postcode, ...),
        # so the files must carry a header row. Declare it explicitly so the
        # header line is consumed as column names instead of relying on the
        # reader default (header=false).
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
        .load("/Volumes/circuitbox/landing/operationaldata/addresses/")
        # Audit columns: originating file and ingestion timestamp.
        .withColumn("input_file_path", col("_metadata.file_path"))
        .withColumn("ingested_time", current_timestamp())
    )

In [0]:
@dp.table(
    name="silver_adresses_clean_py",
    table_properties={"quality": "silver"},
    comment="this contains silver table data",
)
@dp.expect_or_fail("valid_customer_id", "customer_id is not null")
@dp.expect_or_drop("valid_address_id", "address_line_1 is not null")
@dp.expect("valid_postcode", "length(postcode) >= 5")
def create_silver_address_clean():
    """Project the bronze address stream down to the cleaned silver columns.

    Streams from ``bronze_addresses_py`` and keeps ``customer_id``,
    ``address_line_1``, ``city``, ``state``, ``postcode`` and
    ``created_date`` (cast to DATE).

    Three expectations are attached through the decorators above:
    ``customer_id is not null`` (expect_or_fail),
    ``address_line_1 is not null`` (expect_or_drop) and
    ``length(postcode) >= 5`` (expect).

    NOTE(review): the expectation named ``valid_address_id`` actually checks
    ``address_line_1``; the name is left unchanged because it is what gets
    reported for this expectation.

    Returns:
        A streaming DataFrame with the six selected columns.
    """
    bronze_stream = spark.readStream.table("bronze_addresses_py")
    created_on = col("created_date").cast("date")
    return bronze_stream.select(
        "customer_id",
        "address_line_1",
        "city",
        "state",
        "postcode",
        created_on,
    )
 

In [0]:
# Declare the silver streaming table; it is defined here without a query of
# its own.
dp.create_streaming_table(
    name="silver_addresses_py",
    comment="this is silver table",
    table_properties={"quality": "silver"},
)

In [0]:
# Feed silver_addresses_py from the cleaned address stream as an SCD type 2
# flow, keyed on customer_id and ordered by created_date.
# The source name keeps the "adresses" spelling because that is the name the
# cleaned table is registered under.
# NOTE(review): created_date is cast to DATE in the source table, so several
# changes for one customer on the same day share a sequence value - confirm
# that this granularity is sufficient.
dp.create_auto_cdc_flow(
    target="silver_addresses_py",
    source="silver_adresses_clean_py",
    keys=["customer_id"],
    sequence_by="created_date",
    stored_as_scd_type=2,
)