In [0]:

pip install Faker

In [0]:
# Restart the Python process of this notebook.
# NOTE(review): presumably required so the Faker package installed by the previous
# cell becomes importable — confirm against the Databricks documentation.
dbutils.library.restartPython()

# Customer Data Preparation scripts

In [0]:
# Unity Catalog locations used by the rest of the notebook.
catalog = "workspace"
schema = "retail_dlt"
dbName = db = schema  # aliases of the schema name; `db` is used to build the volume path
volume_name = "raw_data"

# Create the catalog, schema and volume when they are missing, and make the
# catalog/schema the defaults for the current Spark session.
spark.sql(f"CREATE CATALOG IF NOT EXISTS `{catalog}`")
spark.sql(f"USE CATALOG `{catalog}`")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`")
spark.sql(f"USE SCHEMA `{schema}`")
spark.sql(f"CREATE VOLUME IF NOT EXISTS `{catalog}`.`{schema}`.`{volume_name}`")

# Root folder inside the volume under which the raw datasets are written.
volume_folder = f"/Volumes/{catalog}/{db}/{volume_name}"

In [0]:
# You can change the catalog, schema, dbName, and db. If you do so, you must also
# change the names in the rest of the tutorial.
catalog = "workspace"
schema = dbName = db = "retail_dlt"
volume_name = "raw_data"

# Make sure the catalog, schema and volume exist and select them for this session.
spark.sql(f'CREATE CATALOG IF NOT EXISTS `{catalog}`')
spark.sql(f'USE CATALOG `{catalog}`')
spark.sql(f'CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}`')
spark.sql(f'USE SCHEMA `{schema}`')
spark.sql(f'CREATE VOLUME IF NOT EXISTS `{catalog}`.`{schema}`.`{volume_name}`')
volume_folder = f"/Volumes/{catalog}/{db}/{volume_name}"

# Generate the synthetic customers dataset only when the customers folder cannot
# be listed (i.e. it has not been generated yet).
try:
  dbutils.fs.ls(volume_folder+"/customers")
except Exception:
  # `except Exception` (rather than a bare `except:`) keeps the "listing failed ->
  # generate" behaviour while letting KeyboardInterrupt/SystemExit propagate
  # instead of silently kicking off a data generation run.
  print(f"folder doesn't exist, generating the data under {volume_folder}...")
  import random
  import uuid
  from collections import OrderedDict

  from faker import Faker
  from pyspark.sql import functions as F

  fake = Faker()

  # Zero-argument UDFs: every row gets a freshly generated fake value.
  fake_firstname = F.udf(fake.first_name)
  fake_lastname = F.udf(fake.last_name)
  fake_email = F.udf(fake.ascii_company_email)
  fake_date = F.udf(lambda:fake.date_time_this_month().strftime("%m-%d-%Y %H:%M:%S"))
  fake_address = F.udf(fake.address)
  # Operation -> weight passed to Faker's random_elements; the None key yields
  # rows without an operation.
  operations = OrderedDict([("APPEND", 0.5),("DELETE", 0.1),("UPDATE", 0.3),(None, 0.01)])
  fake_operation = F.udf(lambda:fake.random_elements(elements=operations, length=1)[0])
  # UUID string when the random draw is below 0.98, otherwise None (a null id).
  fake_id = F.udf(lambda: str(uuid.uuid4()) if random.uniform(0, 1) < 0.98 else None)

  # 100,000 rows; the numeric `id` produced by range() is replaced by the fake id.
  df = spark.range(0, 100000).repartition(10)
  df = df.withColumn("id", fake_id())
  df = df.withColumn("firstname", fake_firstname())
  df = df.withColumn("lastname", fake_lastname())
  df = df.withColumn("email", fake_email())
  df = df.withColumn("address", fake_address())
  df = df.withColumn("operation", fake_operation())
  df_customers = df.withColumn("operation_date", fake_date())
  # Written from a single partition as JSON, replacing any previous content.
  # With the default names above the target is
  # /Volumes/workspace/retail_dlt/raw_data/customers
  df_customers.repartition(1).write.format("json").mode("overwrite").save(volume_folder+"/customers")

In [0]:
# Read the JSON files under the customers folder of the volume back into a
# DataFrame and render it, to inspect the generated data.
display(spark.read.json(volume_folder+"/customers"))

# Incremental File with Schema Evolution

In [0]:
from pyspark.sql import functions as F
from faker import Faker
from collections import OrderedDict
import uuid
import random

fake = Faker()

# Zero-argument UDFs: every row gets a freshly generated fake value.
fake_firstname = F.udf(fake.first_name)
fake_lastname = F.udf(fake.last_name)
fake_email = F.udf(fake.ascii_company_email)
fake_date = F.udf(lambda: fake.date_time_this_month().strftime("%m-%d-%Y %H:%M:%S"))
fake_address = F.udf(fake.address)
# Operation -> weight passed to Faker's random_elements; the None key yields
# rows without an operation.
operations = OrderedDict([("APPEND", 0.5), ("DELETE", 0.1), ("UPDATE", 0.3), (None, 0.01)])
fake_operation = F.udf(lambda: fake.random_elements(elements=operations, length=1)[0])
# UUID string when the random draw is below 0.98, otherwise None (a null id).
fake_id = F.udf(lambda: str(uuid.uuid4()) if random.uniform(0, 1) < 0.98 else None)
# Generators for the columns that only exist in the incremental batch.
fake_phone = F.udf(fake.phone_number)
fake_country = F.udf(fake.country)
fake_signup_source = F.udf(lambda: random.choice(["Web", "Mobile", "Store", "Referral"]))

# Simulate incremental data (e.g., 10,000 new records)
df_incremental = spark.range(100000, 110000).repartition(1)
df_incremental = df_incremental.withColumn("id", fake_id())
df_incremental = df_incremental.withColumn("firstname", fake_firstname())
df_incremental = df_incremental.withColumn("lastname", fake_lastname())
# withColumn returns a new DataFrame, so the result has to be re-assigned;
# otherwise the incremental batch is written without the `email` column.
df_incremental = df_incremental.withColumn("email", fake_email())
df_incremental = df_incremental.withColumn("address", fake_address())
df_incremental = df_incremental.withColumn("operation", fake_operation())
df_incremental = df_incremental.withColumn("operation_date", fake_date())
# Schema evolution: add new columns
df_incremental = df_incremental.withColumn("phone", fake_phone())
df_incremental = df_incremental.withColumn("country", fake_country())
df_incremental = df_incremental.withColumn("signup_source", fake_signup_source())

# Write as a single JSON file with schema evolution.
# Note: mode("append") adds a new file on every execution, so re-running this
# cell appends another batch of 10,000 records to the customers folder.
df_incremental.coalesce(1).write.format("json").mode("append").save(volume_folder + "/customers")

# Validate the table data

In [0]:
%sql
-- Show every row of the bronze customers table.
SELECT * FROM workspace.retail_dlt.customers_bronze

In [0]:
%sql
-- Show every row of the silver customers table.
-- NOTE(review): this reads from schema `customers`, while the setup cells of this
-- notebook create schema `retail_dlt` — confirm the pipeline's target schema.
SELECT * FROM workspace.customers.customers_silver

In [0]:
%sql
-- Show every row of the gold customers table.
-- NOTE(review): this reads from schema `customers`, while the setup cells of this
-- notebook create schema `retail_dlt` — confirm the pipeline's target schema.
SELECT * FROM workspace.customers.customers_gold

In [0]:
%sql
-- Show every row of the customers history table.
-- NOTE(review): this reads from schema `customers`, while the setup cells of this
-- notebook create schema `retail_dlt` — confirm the pipeline's target schema.
SELECT * FROM workspace.customers.customers_history

In [0]:
%sql
-- Show every row of the aggregated customers history table.
-- NOTE(review): this reads from schema `customers`, while the setup cells of this
-- notebook create schema `retail_dlt` — confirm the pipeline's target schema.
SELECT * FROM workspace.customers.customers_history_agg

# Orders Dataset Preparation

In [0]:
# Generate the synthetic orders dataset only when the orders folder cannot be
# listed (i.e. it has not been generated yet). Relies on `volume_folder` from the
# setup cell and on the customers data already being present in the volume.
try:
  dbutils.fs.ls(volume_folder + "/orders")
except Exception:
  # `except Exception` (rather than a bare `except:`) keeps the "listing failed ->
  # generate" behaviour while letting KeyboardInterrupt/SystemExit propagate.
  print(f"folder doesn't exist, generating the data under {volume_folder}...")
  from pyspark.sql import functions as F
  from faker import Faker
  import uuid
  import random

  fake = Faker()
  # Zero-argument UDFs: every row gets a freshly generated fake value.
  fake_order_id = F.udf(lambda: str(uuid.uuid4()))
  fake_order_date = F.udf(lambda: fake.date_time_this_year().strftime("%m-%d-%Y %H:%M:%S"))
  fake_product = F.udf(fake.word)
  fake_amount = F.udf(lambda: round(random.uniform(10, 1000), 2))
  fake_status = F.udf(lambda: random.choice(["PENDING", "SHIPPED", "DELIVERED", "CANCELLED"]))

  # Load customer ids from the customers file
  customers_df = spark.read.json(volume_folder + "/customers")
  customer_ids = customers_df.select("id").where(F.col("id").isNotNull()).distinct()
  # The distinct non-null ids are collected into a Python list so the UDF below
  # can sample from them.
  customer_ids_list = [row["id"] for row in customer_ids.collect()]

  def random_customer_id():
    """Return a random existing customer id, or None when no ids were loaded."""
    return random.choice(customer_ids_list) if customer_ids_list else None

  fake_customer_id = F.udf(random_customer_id)

  # 50,000 orders spread over 50 partitions.
  orders_df = spark.range(0, 50000).repartition(50)
  orders_df = orders_df.withColumn("order_id", fake_order_id())
  orders_df = orders_df.withColumn("customer_id", fake_customer_id())
  orders_df = orders_df.withColumn("order_date", fake_order_date())
  orders_df = orders_df.withColumn("product", fake_product())
  orders_df = orders_df.withColumn("amount", fake_amount())
  orders_df = orders_df.withColumn("status", fake_status())

  # Drop the helper `id` column from range() and write CSV files with a header
  # row, replacing any previous content of the orders folder.
  orders_df.select("order_id", "customer_id", "order_date", "product", "amount", "status") \
    .repartition(50) \
    .write \
    .format("csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save(volume_folder + "/orders")

# Products Dataset Preparation

In [0]:
# Generate the synthetic products dataset only when the products folder cannot be
# listed (i.e. it has not been generated yet). Relies on `volume_folder` from the
# setup cell.
try:
  dbutils.fs.ls(volume_folder + "/products")
except Exception:
  # `except Exception` (rather than a bare `except:`) keeps the "listing failed ->
  # generate" behaviour while letting KeyboardInterrupt/SystemExit propagate.
  print(f"folder doesn't exist, generating the data under {volume_folder}...")
  from pyspark.sql import functions as F
  from faker import Faker
  import uuid
  import random

  fake = Faker()
  # Zero-argument UDFs: every row gets a freshly generated fake value.
  fake_product_id = F.udf(lambda: str(uuid.uuid4()))
  fake_product_name = F.udf(fake.word)
  fake_category = F.udf(lambda: random.choice(["Electronics", "Clothing", "Books", "Home", "Toys", "Sports"]))
  fake_price = F.udf(lambda: round(random.uniform(5, 2000), 2))
  fake_stock = F.udf(lambda: random.randint(0, 500))
  fake_description = F.udf(fake.sentence)

  # 10,000 products.
  products_df = spark.range(0, 10000).repartition(10)
  products_df = products_df.withColumn("product_id", fake_product_id())
  products_df = products_df.withColumn("product_name", fake_product_name())
  products_df = products_df.withColumn("category", fake_category())
  products_df = products_df.withColumn("price", fake_price())
  products_df = products_df.withColumn("stock", fake_stock())
  products_df = products_df.withColumn("description", fake_description())

  # Drop the helper `id` column from range() and write Parquet from a single
  # partition, replacing any previous content of the products folder.
  products_df.select("product_id", "product_name", "category", "price", "stock", "description") \
    .repartition(1) \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save(volume_folder + "/products")

