In [0]:
%pip install faker

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from faker import Faker
import random, json

# Attach to the notebook's Spark session (or start one) under this app name.
spark = (
    SparkSession.builder
    .appName("GenerateBronzeData")
    .getOrCreate()
)
# Single Faker instance (default locale) shared by all generators below.
fake = Faker()

# -----------------------
# Store metadata
# -----------------------
# Five synthetic outlets (ST001..ST005). The regulatory identifiers follow the
# *shape* of the real Indian formats; check digits are not computed.
stores = [
    {
        "store_code": f"ST{i:03}",
        "store_name": f"Olive Mithai Outlet {i}",
        "address_line1": fake.street_address(),
        "city": fake.city(),
        "state": fake.state(),
        "postal_code": fake.postcode(),
        # GSTIN is 15 characters: 2-digit state code, 10-character PAN
        # (5 letters, 4 digits, 1 letter), an entity digit, the literal "Z"
        # and a check character. bothify() can emit lowercase letters, so the
        # result is upper-cased.
        "gstin": fake.bothify(text="##?????####?#Z#").upper(),
        "phone": f"+91{random.randint(6000000000, 9999999999)}",  # +91 and a 10-digit number starting 6-9
        # FSSAI licence numbers are 14 digits long.
        "fssai_no": fake.bothify(text="#" * 14),
    }
    for i in range(1, 6)
]

# -----------------------
# Cashiers, Counters, Payment
# -----------------------
# Ten cashiers with sequential codes CASH001..CASH010 and Faker-generated names.
cashiers = [
    dict(cashier_code="CASH%03d" % seq, cashier_name=fake.name())
    for seq in range(1, 11)
]

# Three billing counters: COUNTER1..COUNTER3.
counters = ["COUNTER" + str(number) for number in (1, 2, 3)]

# Payment methods a generated bill may use.
payment_methods = list(("UPI", "CASH", "CARD"))

# -----------------------
# Product master with extra fields
# -----------------------
# Product master. Every entry is in the SWEETS category and SKUs are numbered
# SKU001.. in list order, so only the varying columns are tabulated here.
_catalog_rows = [
    # (product_name, uom, hsn_code, rate)
    ("MOTI CHOOR LADDU", "KG", "17049020", 760),
    ("KOVA BADUSHA", "KG", "17049028", 880),
    ("RASGULLA", "TIN", "17049029", 720),
    ("KAJU KATLI", "KG", "17049030", 950),
    ("GULAB JAMUN", "TIN", "17049031", 680),
    ("MILK CAKE", "KG", "17049032", 820),
    ("BESAN LADDU", "KG", "17049033", 640),
    ("MYSORE PAK", "KG", "17049034", 850),
    ("JANGRI", "KG", "17049035", 620),
    ("DRY FRUIT BURFI", "KG", "17049036", 1050),
]

products = [
    {
        "sku": f"SKU{position:03}",
        "product_name": name,
        "category": "SWEETS",
        "uom": unit,
        "hsn_code": hsn,
        "rate": price,
    }
    for position, (name, unit, hsn, price) in enumerate(_catalog_rows, start=1)
]

# -----------------------
# Function to generate a single bill JSON
# -----------------------
def generate_bill_json(_):
    """Build one synthetic sales bill and return it serialized as JSON.

    The argument is ignored; it exists only so the function can be applied to
    a column when used as a Spark UDF. The store, cashier, counter and payment
    method are picked at random from the module-level lists, and the bill gets
    between one and five line items, each taxed at a flat 5%.

    Returns:
        str: a JSON object holding the bill header fields, an "items" list and
        "total_amount" (sum of line amounts plus tax, rounded to 2 decimals).
    """
    chosen_store = random.choice(stores)
    chosen_cashier = random.choice(cashiers)
    chosen_counter = random.choice(counters)
    chosen_payment = random.choice(payment_methods)

    bill = {
        "bill_no": str(fake.random_int(min=10000, max=99999)),
        "bill_datetime": fake.date_time_this_year().isoformat(),
    }
    # Copy the store and cashier attributes onto the header in a fixed order.
    for field in ("store_code", "store_name", "address_line1", "city", "state",
                  "postal_code", "gstin", "phone", "fssai_no"):
        bill[field] = chosen_store[field]
    for field in ("cashier_code", "cashier_name"):
        bill[field] = chosen_cashier[field]
    bill["counter_name"] = chosen_counter
    bill["payment_method"] = chosen_payment

    gst_percent = 5.0
    line_items = []
    grand_total = 0
    line_count = random.randint(1, 5)
    while len(line_items) < line_count:
        product = random.choice(products)
        quantity = round(random.uniform(0.25, 2.0), 2)
        line_amount = round(product["rate"] * quantity, 2)
        line_tax = round(line_amount * gst_percent / 100, 2)
        # Accumulate line by line (amount + tax) so float rounding is stable.
        grand_total += line_amount + line_tax

        line = {key: product[key] for key in ("sku", "product_name", "category", "uom", "hsn_code")}
        line["quantity"] = quantity
        line["rate"] = product["rate"]
        line["amount"] = line_amount
        line["tax_percent"] = gst_percent
        line["tax_amount"] = line_tax
        line_items.append(line)

    bill["items"] = line_items
    bill["total_amount"] = round(grand_total, 2)

    return json.dumps(bill)

# -----------------------
# Register as Spark UDF
# -----------------------
# The generator draws from random/Faker on every call, so the UDF is flagged
# as non-deterministic; otherwise Spark's optimizer is free to eliminate or
# duplicate invocations.
generate_bill_udf = udf(generate_bill_json, StringType()).asNondeterministic()

# -----------------------
# Generate N records
# -----------------------
n_records = 100000
# One row per id; the id value itself is ignored by the generator.
df = spark.range(0, n_records).withColumn("json_payload", generate_bill_udf("id"))

output_path = "/Volumes/oliv_mitai_uc/bronze/olive_mitai_raw_data/"
# Written as JSON Lines: each line is an object with a single "json_payload"
# string field. Existing files at the path are replaced.
df.select("json_payload").write.mode("overwrite").json(output_path)

print(f"Generated {n_records} JSON records → {output_path}")


In [0]:
# List the contents of the raw-data volume directory the generator wrote to.
display(dbutils.fs.ls("/Volumes/oliv_mitai_uc/bronze/olive_mitai_raw_data/"))

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType
# Bill header columns; all are read as nullable strings.
_bill_text_columns = [
    "bill_no", "bill_datetime", "store_code", "store_name", "address_line1",
    "city", "state", "postal_code", "gstin", "phone", "fssai_no",
    "cashier_code", "cashier_name", "counter_name", "payment_method",
]
# Columns of one "items" entry: descriptive strings, then numeric measures.
_item_text_columns = ["sku", "product_name", "category", "uom", "hsn_code"]
_item_numeric_columns = ["quantity", "rate", "amount", "tax_percent", "tax_amount"]

_item_schema = StructType(
    [StructField(name, StringType(), True) for name in _item_text_columns]
    + [StructField(name, DoubleType(), True) for name in _item_numeric_columns]
)

# Full schema of the JSON payload: header strings, the items array, the total.
schema = StructType(
    [StructField(name, StringType(), True) for name in _bill_text_columns]
    + [
        StructField("items", ArrayType(_item_schema), True),
        StructField("total_amount", DoubleType(), True),
    ]
)


In [0]:
from pyspark.sql.functions import col, from_json, to_date, year, month
# The raw files are JSON Lines (one object per line), which is Spark's default
# JSON layout. multiLine=True must NOT be set here: it makes Spark parse each
# part file as a single JSON document, dropping every record after the first.
df_raw = spark.read.json("/Volumes/oliv_mitai_uc/bronze/olive_mitai_raw_data/")
# Each row carries the bill as a JSON string; expand it into typed columns.
df_parsed = df_raw.withColumn("data", from_json(col("json_payload"), schema)).select("data.*")
df_parsed.printSchema()

In [0]:
# Render the parsed bills for a visual check of the flattened columns.
display(df_parsed)

In [0]:
# from pyspark.sql.functions import current_timestamp, input_file_name
# df_bronze = (
#     df_raw
#     .withColumn("ingest_ts", current_timestamp())
#     .withColumn("source_file", input_file_name())
# )

# NOTE: input_file_name() is not supported in Unity Catalog (SQLSTATE: 0AKUC); use the _metadata.file_path column instead. The block above is kept disabled for that reason.
from pyspark.sql.functions import current_timestamp, input_file_name, to_date, col
# Add the bill's calendar date (from the ISO timestamp string) and the load time.
df_bronze = (
    df_parsed
    .withColumn("bill_date", to_date(col("bill_datetime")))
    .withColumn("ingest_ts", current_timestamp())
)

In [0]:
# Append this batch to the bronze Delta table, partitioned by bill date;
# mergeSchema lets new columns be added to the table on write.
(
    df_bronze.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .partitionBy("bill_date")
    .saveAsTable("oliv_mitai_uc.bronze.olive_mitai_sales")
)


In [0]:
%sql
-- Inspect the rows loaded into the bronze sales table.
SELECT *
FROM oliv_mitai_uc.bronze.olive_mitai_sales;