In [0]:
from pyspark.sql import functions as F
import random
import time
from datetime import datetime

# ----------------------------
# Widgets
# ----------------------------
# Widget values arrive as strings; the defaults below are parsed to int right after.
dbutils.widgets.text("time_interval", "5")     # seconds
dbutils.widgets.text("min_records", "10")
dbutils.widgets.text("max_records", "100")

time_interval = int(dbutils.widgets.get("time_interval"))
min_records  = int(dbutils.widgets.get("min_records"))
max_records  = int(dbutils.widgets.get("max_records"))

# Fail fast with a readable message: time.sleep() rejects a negative duration and
# random.randint() rejects an empty range, but both would only surface later,
# inside the generator loop, with a far less helpful error.
if time_interval < 0:
    raise ValueError(f"time_interval must be >= 0 seconds, got {time_interval}")
if min_records > max_records:
    raise ValueError(
        f"min_records ({min_records}) must not exceed max_records ({max_records})"
    )

In [0]:
# Landing directory that receives the finished CSV batches.
TARGET_DIR = "/Volumes/otc/volumn/landingfiles/streaming_data/generated_data"
# Staging area where Spark's CSV output directory is written before its single
# part file is moved into TARGET_DIR.
# NOTE(review): the leading underscore presumably keeps downstream file readers
# from picking up staging files — confirm against the consumer.
TMP_DIR_BASE = f"{TARGET_DIR}/_tmp_single_part_write"

# Create both locations up front: the target first, then the staging area nested in it.
for _required_dir in (TARGET_DIR, TMP_DIR_BASE):
    dbutils.fs.mkdirs(_required_dir)

def write_single_csv(df, target_dir, file_name):
    """Write ``df`` as one headered CSV file at ``{target_dir}/{file_name}``.

    The frame is coalesced to a single partition and written to a staging
    directory under ``TMP_DIR_BASE``. The first ``part-*`` file found there is
    moved to its final name, and the staging directory is removed afterwards,
    whether or not the write succeeded.

    Args:
        df: DataFrame to write.
        target_dir: Directory that receives the final file.
        file_name: Name of the final file. A trailing ``.csv`` is dropped to
            derive the staging directory name.

    Raises:
        ValueError: if ``file_name`` leaves an empty staging directory name.
        IndexError: if the write produced no ``part-*`` file in the staging
            directory.
    """
    # Only strip a real ".csv" suffix; blindly slicing four characters could
    # leave an empty stem for short names.
    stem = file_name[:-4] if file_name.lower().endswith(".csv") else file_name
    if not stem:
        # An empty stem would make tmp_dir resolve to TMP_DIR_BASE itself, and
        # the recursive rm below would then wipe the whole staging area.
        raise ValueError(f"file_name {file_name!r} yields an empty staging directory name")

    tmp_dir = f"{TMP_DIR_BASE}/{stem}"
    # Clear leftovers from an earlier run that used the same name.
    dbutils.fs.rm(tmp_dir, recurse=True)

    try:
        (
            df.coalesce(1)
              .write
              .mode("overwrite")
              .option("header", "true")
              .csv(tmp_dir)
        )

        part_files = [f.path for f in dbutils.fs.ls(tmp_dir) if f.name.startswith("part-")]
        if not part_files:
            raise IndexError(f"no part-* file was produced in {tmp_dir}")
        dbutils.fs.mv(part_files[0], f"{target_dir}/{file_name}", True)
    finally:
        # Remove the staging directory even when the write or move failed, so
        # failed attempts do not accumulate under TMP_DIR_BASE.
        dbutils.fs.rm(tmp_dir, recurse=True)

# Emit one synthetic employee batch per iteration; the loop has no exit
# condition and runs until the notebook run is cancelled or a step raises.
while True:
    n = random.randint(min_records, max_records)
    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")
    file_name = f"batch_{ts}.csv"

    # Epoch milliseconds (currently a 13-digit value); first emp_id of this batch.
    base_id = int(time.time() * 1000)

    salary_base = random.randint(30000, 120000)
    salary_step = random.randint(10, 200)

    df = (
        spark.range(n)
             .withColumn("emp_id", F.lit(base_id) + F.col("id"))        # BIGINT (safe)
             # STRING (derived). format_string zero-pads to a minimum width but,
             # unlike lpad, never truncates a longer value, so the full
             # emp_id survives and emp_code stays distinct per row.
             .withColumn("emp_code", F.format_string("E-%08d", F.col("emp_id")))
             # Same non-truncating padding for the row number in the name.
             .withColumn("name", F.format_string("Emp_%05d", F.col("id")))
             .withColumn("salary",
                         (F.lit(salary_base) + F.col("id") * salary_step).cast("int"))
             .select("emp_id", "emp_code", "name", "salary")
    )

    write_single_csv(df, TARGET_DIR, file_name)
    time.sleep(time_interval)
