In [0]:
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, ArrayType
from delta.tables import DeltaTable

# Three-part table names: the bronze source and the two silver targets.
bronze_table = "oliv_mitai_uc.bronze.olive_mitai_sales"
silver_table = "oliv_mitai_uc.silver.sales_silver"
invalid_table = "oliv_mitai_uc.silver.invalid_sales"

# Shape of one bill record: bill-level fields plus an `items` array of
# line-item structs.
# NOTE(review): `schema` is not referenced by any code visible in this cell
# (spark.table() below uses the table's own schema) -- confirm whether it is
# still needed before removing it.
schema = StructType([
    StructField("bill_datetime", StringType(), True),
    StructField("bill_no", StringType(), True),
    StructField("cashier_code", StringType(), True),
    StructField("counter_name", StringType(), True),
    StructField("items", ArrayType(StructType([
        StructField("amount", DoubleType(), True),
        StructField("hsn_code", StringType(), True),
        StructField("product_name", StringType(), True),
        StructField("quantity", DoubleType(), True),
        StructField("rate", LongType(), True),
        StructField("tax_amount", DoubleType(), True),
        StructField("tax_percent", DoubleType(), True)
    ])), True),
    StructField("payment_method", StringType(), True),
    StructField("store_code", StringType(), True),
    StructField("total_amount", DoubleType(), True)
])

df_raw = spark.table(bronze_table)

# Output column order: bill-level fields first, then the fields lifted out of
# each exploded line item.
bill_fields = [
    "bill_datetime",
    "bill_no",
    "cashier_code",
    "counter_name",
    "payment_method",
    "store_code",
    "total_amount",
]
item_fields = [
    "amount",
    "hsn_code",
    "product_name",
    "quantity",
    "rate",
    "tax_amount",
    "tax_percent",
]

# One row per line item.
# NOTE(review): explode() emits no row for a bill whose `items` is NULL or
# empty, so such bills reach neither df_valid nor df_invalid -- confirm that
# dropping them is intended.
df_flat = (
    df_raw
    .withColumn("item", explode("items"))
    .select(
        *[col(name) for name in bill_fields],
        *[col(f"item.{name}").alias(name) for name in item_fields],
    )
)

# A line item is valid when it carries both identifiers and strictly positive
# amounts. A comparison against NULL evaluates to NULL, and filter() keeps only
# rows where the predicate is true, so NULL amounts are not valid either.
is_valid = (
    col("bill_no").isNotNull()
    & col("store_code").isNotNull()
    & (col("total_amount") > 0)
    & (col("quantity") > 0)
)

df_valid = df_flat.filter(is_valid)

# Exact complement of df_valid: rows where the predicate is false OR NULL.
# Filtering once replaces `df_flat.subtract(df_valid)`, which evaluated the
# lineage twice and ran a set-difference; distinct() keeps the de-duplication
# that subtract (EXCEPT DISTINCT) applied to its result.
df_invalid = df_flat.filter(is_valid.isNull() | ~is_valid).distinct()

def ensure_table_exists(table_name: str, df) -> None:
    """Create an empty Delta table shaped like ``df`` if ``table_name`` is missing.

    The namespace that is searched is taken from ``table_name`` itself
    (everything before the last dot) instead of a fixed schema, so the check
    is correct for tables in any catalog/schema.

    Args:
        table_name: Table identifier, optionally qualified
            (``catalog.schema.table`` or ``schema.table``).
        df: DataFrame whose schema the new table gets; no rows are written.
    """
    namespace, _, short_name = table_name.rpartition(".")
    # An unqualified name has no namespace; listTables(None) then lists the
    # session's current database.
    existing = {t.name.lower() for t in spark.catalog.listTables(namespace or None)}
    # Compare case-insensitively so a differently-cased name does not trigger
    # a second saveAsTable on a table that already exists.
    if short_name.lower() not in existing:
        df.limit(0).write.format("delta").saveAsTable(table_name)

# Make sure both targets exist before DeltaTable.forName resolves them.
ensure_table_exists(silver_table, df_valid)
ensure_table_exists(invalid_table, df_invalid)

silver_delta = DeltaTable.forName(spark, silver_table)
invalid_delta = DeltaTable.forName(spark, invalid_table)

# Both targets are upserted on the same (bill_no, product_name) key.
# NOTE(review): `=` never matches when either side is NULL, so source rows
# with a NULL bill_no or product_name (possible in df_invalid, whose rows may
# have failed the bill_no check) are inserted again on every run.
# NOTE(review): if two source rows share the same key and both match an
# existing target row, Delta rejects the merge -- confirm that
# (bill_no, product_name) is unique per line item.
merge_condition = "t.bill_no = s.bill_no AND t.product_name = s.product_name"

# Valid rows go to the silver table first, then rejected rows to the invalid
# table: update every column on a key match, insert the row otherwise.
for target, source in ((silver_delta, df_valid), (invalid_delta, df_invalid)):
    (
        target.alias("t")
        .merge(source.alias("s"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )


In [0]:
%sql
-- Show every row currently in the silver sales table.
SELECT *
FROM oliv_mitai_uc.silver.sales_silver;

In [0]:
%sql
-- Show every row currently in the invalid-sales table.
SELECT *
FROM oliv_mitai_uc.silver.invalid_sales;