In [0]:

pip install Faker

In [0]:
# Restart the notebook's Python process. Presumably this is here so that the
# Faker package installed in the previous cell is importable by later cells;
# note that a restart clears all Python variables defined so far.
dbutils.library.restartPython()

# Orders Dataset Preparation

In [0]:
# Unity Catalog location that holds the generated raw files.
catalog = "workspace"
schema = dbName = db = "retail_dlt"  # three aliases bound to the same schema name
volume_name = "raw_data"

# Create (if missing) and select the catalog, schema and volume.
spark.sql(f'CREATE CATALOG IF NOT EXISTS `{catalog}`')
spark.sql(f'USE CATALOG `{catalog}`')
spark.sql(f'CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`')
spark.sql(f'USE SCHEMA `{schema}`')
spark.sql(f'CREATE VOLUME IF NOT EXISTS `{catalog}`.`{schema}`.`{volume_name}`')
volume_folder =  f"/Volumes/{catalog}/{db}/{volume_name}"


# Probe for an existing orders folder; the data is only generated when the
# listing fails. `except Exception` (instead of a bare `except:`) lets
# KeyboardInterrupt / SystemExit propagate, and keeping the generation code
# outside the handler means a failure while generating is reported on its own
# rather than chained to the listing error.
try:
  dbutils.fs.ls(volume_folder + "/orders")
  orders_folder_exists = True
except Exception:
  orders_folder_exists = False

if not orders_folder_exists:
  print(f"folder doesn't exist, generating the data under {volume_folder}...")
  from pyspark.sql import functions as F
  from faker import Faker
  import uuid
  import random

  fake = Faker()
  # Every generator below returns a fresh random value per call, so each UDF is
  # flagged as nondeterministic: Spark otherwise assumes UDFs are deterministic
  # and may drop or duplicate invocations while optimizing the plan.
  # No return type is passed, so F.udf falls back to its default (string).
  fake_order_id = F.udf(lambda: str(uuid.uuid4())).asNondeterministic()
  fake_order_date = F.udf(lambda: fake.date_time_this_year().strftime("%m-%d-%Y %H:%M:%S")).asNondeterministic()
  fake_product = F.udf(fake.word).asNondeterministic()
  fake_amount = F.udf(lambda: round(random.uniform(10, 1000), 2)).asNondeterministic()
  fake_status = F.udf(lambda: random.choice(["PENDING", "SHIPPED", "DELIVERED", "CANCELLED"])).asNondeterministic()

  # Load customer ids from the customers file
  # (the customers folder must already exist in the volume; it is not created here).
  customers_df = spark.read.json(volume_folder + "/customers")
  customer_ids = customers_df.select("id").where(F.col("id").isNotNull()).distinct()
  # Collected to the driver so the UDF below can sample from a plain Python list.
  customer_ids_list = [row["id"] for row in customer_ids.collect()]

  def random_customer_id():
    """Return a random id from customer_ids_list, or None when the list is empty."""
    return random.choice(customer_ids_list) if customer_ids_list else None

  fake_customer_id = F.udf(random_customer_id).asNondeterministic()

  # 50,000 rows spread over 50 partitions while the UDF columns are computed.
  orders_df = spark.range(0, 50000).repartition(50)
  orders_df = orders_df.withColumn("order_id", fake_order_id())
  orders_df = orders_df.withColumn("customer_id", fake_customer_id())
  orders_df = orders_df.withColumn("order_date", fake_order_date())
  orders_df = orders_df.withColumn("product", fake_product())
  orders_df = orders_df.withColumn("amount", fake_amount())
  orders_df = orders_df.withColumn("status", fake_status())

  # Drop the helper `id` column from spark.range and write everything as a
  # single CSV part file with a header row.
  orders_df.select("order_id", "customer_id", "order_date", "product", "amount", "status") \
    .repartition(1) \
    .write \
    .format("csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save(volume_folder + "/orders")

In [0]:
# Scratch note: example paths inside the raw-data volume. Both are kept as
# comments because a bare path is not valid Python and would make this cell
# fail with a SyntaxError when the notebook is run top to bottom.
#/Volumes/workspace/retail_dlt/raw_data/customers/
#/Volumes/workspace/retail_dlt/raw_data/orders/part-00000-tid-2447337226612876626-c1324c15-67e6-498e-a00d-79a5025341a7-202-1-c000.csv

In [0]:
%sql
-- Show every row of the orders_bronze table.
-- NOTE(review): this table is not created anywhere in this notebook; presumably
-- it is produced by a separate pipeline reading the orders folder - confirm.
SELECT *
FROM workspace.retail_dlt.orders_bronze

# Incremental File with Schema Evolution

In [0]:
from pyspark.sql import functions as F
from faker import Faker
import uuid
import random

# NOTE: this cell relies on `volume_folder` defined by the
# "Orders Dataset Preparation" cell; run that cell first.

fake = Faker()
# Every generator below returns a fresh random value per call, so each UDF is
# flagged as nondeterministic: Spark otherwise assumes UDFs are deterministic
# and may drop or duplicate invocations while optimizing the plan.
# No return type is passed, so F.udf falls back to its default (string).
fake_order_id = F.udf(lambda: str(uuid.uuid4())).asNondeterministic()
fake_order_date = F.udf(lambda: fake.date_time_this_year().strftime("%m-%d-%Y %H:%M:%S")).asNondeterministic()
fake_product = F.udf(fake.word).asNondeterministic()
fake_amount = F.udf(lambda: round(random.uniform(10, 1000), 2)).asNondeterministic()
fake_status = F.udf(lambda: random.choice(["PENDING", "SHIPPED", "DELIVERED", "CANCELLED"])).asNondeterministic()

# Load customer ids from the customers file
customers_df = spark.read.json(volume_folder + "/customers")
customer_ids = customers_df.select("id").where(F.col("id").isNotNull()).distinct()
# Collected to the driver so the UDF below can sample from a plain Python list.
customer_ids_list = [row["id"] for row in customer_ids.collect()]

def random_customer_id():
    """Return a random id from customer_ids_list, or None when the list is empty."""
    return random.choice(customer_ids_list) if customer_ids_list else None

fake_customer_id = F.udf(random_customer_id).asNondeterministic()

# Faker addresses span several lines. A newline inside a CSV field splits the
# record for any reader that is not configured for multi-line values, so the
# address is flattened to a single line before it is written.
fake_shipping_address = F.udf(lambda: fake.address().replace("\n", ", ")).asNondeterministic()
fake_priority = F.udf(lambda: random.choice(["LOW", "MEDIUM", "HIGH"])).asNondeterministic()
fake_discount = F.udf(lambda: round(random.uniform(0, 0.3), 2)).asNondeterministic()

# Generate incremental data with schema evolution (extra columns)
# 10,000 rows; the id range starts at 50000.
incremental_df = spark.range(50000, 60000).repartition(10)
incremental_df = incremental_df.withColumn("order_id", fake_order_id())
incremental_df = incremental_df.withColumn("customer_id", fake_customer_id())
incremental_df = incremental_df.withColumn("order_date", fake_order_date())
incremental_df = incremental_df.withColumn("product", fake_product())
incremental_df = incremental_df.withColumn("amount", fake_amount())
incremental_df = incremental_df.withColumn("status", fake_status())
# Schema evolution: add new columns
incremental_df = incremental_df.withColumn("shipping_address", fake_shipping_address())
incremental_df = incremental_df.withColumn("priority", fake_priority())
incremental_df = incremental_df.withColumn("discount", fake_discount())

# Append one extra CSV part file (with header) to the existing orders folder;
# it carries three more columns than the file written by the preparation cell.
incremental_df.select(
    "order_id", "customer_id", "order_date", "product", "amount", "status",
    "shipping_address", "priority", "discount"
).repartition(1).write.format("csv").option("header", "true").mode("append").save(volume_folder + "/orders")

# Validate the table data