In [0]:
%pip install faker

In [0]:
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import StringType
from faker import Faker
import random, json

# Single Faker instance; generate_bill_json uses it for bill numbers and bill timestamps.
fake = Faker()

# Product catalogue used to build bill line items. Each entry carries the
# product name, its HSN code (kept as a string) and the rate used for pricing.
products = [
    {"product_name": name, "hsn_code": hsn, "rate": rate}
    for name, hsn, rate in (
        ("MOTI CHOOR LADDU", "17049020", 760),
        ("KOVA BADUSHA", "17049028", 880),
        ("RASGULLA", "17049029", 720),
        ("KAJU KATLI", "17049030", 950),
        ("GULAB JAMUN", "17049031", 680),
        ("MILK CAKE", "17049032", 820),
        ("BESAN LADDU", "17049033", 640),
        ("MYSORE PAK", "17049034", 850),
        ("JANGRI", "17049035", 620),
        ("DRY FRUIT BURFI", "17049036", 1050),
        ("KALAKAND", "17049037", 780),
        ("BOONDI LADDU", "17049038", 690),
        ("MALAI SANDWICH", "17049039", 970),
        ("BADAM HALWA", "17049040", 1100),
        ("PISTA ROLL", "17049041", 980),
        ("SOAN PAPDI", "17049042", 560),
        ("CHOCOLATE BURFI", "17049043", 900),
        ("CASHEW LADDU", "17049044", 960),
        ("ANJEER ROLL", "17049045", 1020),
        ("MOTICHOOR MODAK", "17049046", 880),
    )
]

# Pools the bill generator samples from.
payment_methods = ["UPI", "CASH", "CARD", "NETBANKING"]
store_codes = [f"ST{i:03}" for i in range(1, 5 + 1)]        # ST001 .. ST005
cashier_codes = [f"CASH{i:03}" for i in range(1, 9 + 1)]    # CASH001 .. CASH009
counter_names = [f"COUNTER{i}" for i in range(1, 3 + 1)]    # COUNTER1 .. COUNTER3

def generate_bill_json(_):
    """Build one synthetic sales bill and return it as a JSON string.

    The single argument is ignored; it exists so the function can be applied
    to a column as a Spark UDF.

    The bill carries a random 5-digit bill number, a store / cashier /
    counter / payment method picked from the module-level pools, a timestamp
    from Faker's ``date_time_this_year``, between 1 and 5 line items, and the
    rounded grand total (amounts plus tax).

    Returns:
        str: the bill serialized with ``json.dumps``.
    """
    tax_rate_pct = 5.0  # flat tax percentage applied to every line item

    # Header fields are filled in a fixed order so the JSON key order is stable.
    bill = {}
    bill["bill_no"] = str(fake.random_int(min=10000, max=99999))
    bill["store_code"] = random.choice(store_codes)
    bill["bill_datetime"] = fake.date_time_this_year().isoformat()
    bill["cashier_code"] = random.choice(cashier_codes)
    bill["counter_name"] = random.choice(counter_names)
    bill["payment_method"] = random.choice(payment_methods)

    line_items = []
    running_total = 0
    line_count = random.randint(1, 5)
    for _line_no in range(line_count):
        product = random.choice(products)
        quantity = round(random.uniform(0.25, 2.0), 2)
        line_amount = round(product["rate"] * quantity, 2)
        line_tax = round(line_amount * tax_rate_pct / 100, 2)
        # Accumulate explicitly (not via sum()) to keep the float rounding order.
        running_total = running_total + (line_amount + line_tax)
        line_items.append({
            "product_name": product["product_name"],
            "hsn_code": product["hsn_code"],
            "quantity": quantity,
            "rate": product["rate"],
            "amount": line_amount,
            "tax_percent": tax_rate_pct,
            "tax_amount": line_tax
        })

    bill["items"] = line_items
    bill["total_amount"] = round(running_total, 2)

    return json.dumps(bill)

# generate_bill_json draws from random/Faker, so the UDF must be flagged as
# non-deterministic. Spark treats UDFs as deterministic by default and may then
# eliminate duplicate invocations or call the function more often than written.
generate_bill_udf = udf(generate_bill_json, StringType()).asNondeterministic()

n_records = 1_000_000
# One row per id; the id column is only passed in to trigger one UDF call per row.
df = spark.range(0, n_records).withColumn("json_payload", generate_bill_udf("id"))

# output_path = "/dbfs/FileStore/bronze/sales_raw/"
output_path = "/Volumes/oliv_mitai_uc/bronze/olive_mitai_raw_data/"
# The text writer emits each JSON string on its own line; existing output is replaced.
df.select("json_payload").write.mode("overwrite").text(output_path)

print(f"Successfully generated {n_records} JSON records to {output_path}")


In [0]:
# List the landing folder to check which files are present.
raw_landing_files = dbutils.fs.ls("/Volumes/oliv_mitai_uc/bronze/olive_mitai_raw_data/")
display(raw_landing_files)

In [0]:
# The landing files are produced by DataFrameWriter.text, i.e. one JSON bill per
# line (JSON Lines). multiLine mode parses each file as a single JSON document
# and would keep only the first bill of every file, so the default
# line-delimited reader is used instead.
df_raw = spark.read.json("/Volumes/oliv_mitai_uc/bronze/olive_mitai_raw_data/")
df_raw.printSchema()

In [0]:
# Preview the raw bills parsed from the landing JSON files.
display(df_raw)

In [0]:
# input_file_name() is rejected under Unity Catalog (SQLSTATE: 0AKUC); the error
# message points to the hidden _metadata.file_path column as the replacement,
# so the source file lineage is taken from there.
from pyspark.sql.functions import col, current_timestamp, input_file_name

# A single select directly on the file-based DataFrame keeps the hidden
# _metadata column resolvable while adding both audit columns.
df_bronze = df_raw.select(
    "*",
    current_timestamp().alias("ingest_ts"),            # when the row was ingested
    col("_metadata.file_path").alias("source_file"),   # file the row was read from
)

In [0]:
# Save the bronze DataFrame as a Delta table, overwriting existing data with
# the mergeSchema option enabled.
(
    df_bronze.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("oliv_mitai_uc.bronze.olive_mitai_sales")
)


In [0]:
%sql
-- Inspect the contents of the bronze sales table.
SELECT *
FROM oliv_mitai_uc.bronze.olive_mitai_sales;