In [0]:
from pyspark import pipelines as dp
from pyspark.sql.functions import col,current_timestamp,explode

In [0]:
@dp.table(
    name="bronze_orders_py",
    comment="this is bronze table",
    table_properties={"quality": "bronze"},
)
def create_bronze_orders():
    """Define the ``bronze_orders_py`` table from the orders landing volume.

    Opens a streaming read with the ``cloudFiles`` source (Databricks Auto
    Loader) over JSON files, with column type inference switched on, and
    appends two columns to every row:

    * ``input_file_path`` -- copied from ``_metadata.file_path``.
    * ``ingestion_time`` -- the value of ``current_timestamp()``.

    Returns:
        The streaming DataFrame that backs the table.
    """
    # Streaming source over the landing directory; types are inferred
    # instead of leaving every JSON field as a string.
    landed_orders = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferColumnTypes", "true")
        .load("/Volumes/circuitbox/landing/operationaldata/orders/")
    )

    # Lineage column: which file each record was read from.
    with_source_file = landed_orders.withColumn(
        "input_file_path", col("_metadata.file_path")
    )

    # Audit column: when the record was processed.
    return with_source_file.withColumn("ingestion_time", current_timestamp())

In [0]:
@dp.table(
    name="silver_orders_clean_py",
    table_properties={"quality": "silver"},
)
@dp.expect_all_or_fail(
    {
        "validate_customer_id": "customer_id is not null",
        "validate_order_id": "order_id is not null",
    }
)
@dp.expect_all(
    {
        "validate_payment_method": "payment_method in ('Credit Card','Bank Transfer','Paypal')",
        "validate_order_status": "order_status in ('Completed','Shipped','Cancelled','Pending')",
    }
)
def create_silver_orders_clean():
    """Define the ``silver_orders_clean_py`` table: one row per order item.

    Streams from ``live.bronze_orders_py``, explodes the ``items`` array so
    that each element becomes its own row, and projects the order-level
    columns alongside the fields of the exploded item.
    ``order_timestamp`` is cast to ``timestamp``.

    Data-quality expectations attached by the decorators:

    * ``expect_all_or_fail`` -- ``customer_id`` and ``order_id`` must be
      non-null.
    * ``expect_all`` -- ``payment_method`` and ``order_status`` are checked
      against fixed lists of allowed values.

    Returns:
        The streaming DataFrame that backs the table.
    """
    bronze_stream = spark.readStream.table("live.bronze_orders_py")

    # Replace the array column with its individual elements (one per row).
    # Note: explode() emits no row for a null or empty array.
    per_item = bronze_stream.withColumn("items", explode(col("items")))

    # Order-level columns first, then the fields pulled out of the exploded
    # item struct; the order here fixes the output column order.
    item_fields = ("category", "item_id", "price", "name", "quantity")
    projection = [
        "customer_id",
        "order_id",
        "order_status",
        "payment_method",
        col("order_timestamp").cast("timestamp"),
        *[col(f"items.{field}") for field in item_fields],
    ]
    return per_item.select(*projection)