In [0]:
# STEP 0 — Setup
# Central configuration cell: every storage location and table name used by
# the pipeline is defined once here so that later cells can refer to them.

# Landing zone — one folder per source feed (orders, customers).
LANDING_ORDERS    = "dbfs:/FileStore/tables/dlt/landing/orders"
LANDING_CUSTOMERS = "dbfs:/FileStore/tables/dlt/landing/customers"

# Silver layer — folder for the cleaned orders data, kept in Delta format.
DELTA_SILVER_PATH = "dbfs:/tmp/delta/sil_orders"

# Name of the SQL table associated with the silver Delta folder.
DELTA_TABLE_NAME  = "sil_orders_tbl"

# Echo the configuration in a single call; sep="\n" puts each item on its
# own line, so the output matches one print() per line.
print(
    "We will store data in:",
    f"Landing Orders folder:    {LANDING_ORDERS}",
    f"Landing Customers folder: {LANDING_CUSTOMERS}",
    f"Silver Delta folder:      {DELTA_SILVER_PATH}",
    f"SQL Table name:           {DELTA_TABLE_NAME}",
    sep="\n",
)

We will store data in:
Landing Orders folder:    dbfs:/FileStore/tables/dlt/landing/orders
Landing Customers folder: dbfs:/FileStore/tables/dlt/landing/customers
Silver Delta folder:      dbfs:/tmp/delta/sil_orders
SQL Table name:           sil_orders_tbl


In [0]:
from delta import *
import pyspark.sql.functions as F
from pyspark.sql import types as T

print("STEP 1: Seeding inline data to landing (JSON) ...")

# Inline sample records.
# Each orders tuple is (order_id, customer_id, order_ts, amount, status);
# order_ts starts out as a plain string and is parsed further below.
orders_rows = [
    (1, "C001", "2025-08-08 09:00:00", 12000, "placed"),
    (2, "C002", "2025-08-08 09:05:00", 4500, "placed"),
    (3, "C001", "2025-08-08 09:10:00", 22000, "cancelled"),
    (4, "C003", "2025-08-08 09:15:00", 800, "placed"),
]
# Each customers tuple is (customer_id, name, city).
customers_rows = [
    ("C001", "Ananya", "Bengaluru"),
    ("C002", "Rahul", "Hyderabad"),
    ("C003", "Meera", "Pune"),
]

# Explicit schemas, built from (column name, Spark type) pairs so that the
# column order matches the tuple layout of the rows above.
orders_schema = T.StructType([
    T.StructField(col_name, col_type)
    for col_name, col_type in (
        ("order_id", T.IntegerType()),
        ("customer_id", T.StringType()),
        ("order_ts", T.StringType()),
        ("amount", T.IntegerType()),
        ("status", T.StringType()),
    )
])
cust_schema = T.StructType([
    T.StructField(col_name, col_type)
    for col_name, col_type in (
        ("customer_id", T.StringType()),
        ("name", T.StringType()),
        ("city", T.StringType()),
    )
])

# Build the DataFrames; order_ts is converted from string to timestamp
# before anything is written out.
orders_df = spark.createDataFrame(data=orders_rows, schema=orders_schema).withColumn(
    "order_ts", F.to_timestamp(F.col("order_ts"))
)
customers_df = spark.createDataFrame(data=customers_rows, schema=cust_schema)

# Write both feeds to their landing folders as JSON in "overwrite" mode.
orders_df.write.json(LANDING_ORDERS, mode="overwrite")
customers_df.write.json(LANDING_CUSTOMERS, mode="overwrite")

# Confirmation message: header plus one indented line per landing folder.
print(
    "✅ Seeded landing JSON:",
    f"  {LANDING_ORDERS}",
    f"  {LANDING_CUSTOMERS}",
    sep="\n",
)


STEP 1: Seeding inline data to landing (JSON) ...
✅ Seeded landing JSON:
  dbfs:/FileStore/tables/dlt/landing/orders
  dbfs:/FileStore/tables/dlt/landing/customers
