In [0]:
pip install --upgrade dbldatagen

In [0]:
# ---------------------------------------------
# CONFIG
# ---------------------------------------------
# Target table, addressed as <catalog>.<schema>.<table>.
table_name = "dim_customer"
db_name = "dev.bronze"
full_table_name = ".".join([db_name, table_name])

# Switch the session's current catalog to `dev`.
spark.sql("USE CATALOG dev")

# ---------------------------------------------
# Imports
# ---------------------------------------------
import dbldatagen as dg
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window

# ---------------------------------------------
# Read current max surrogate key (if table exists)
# ---------------------------------------------
# Start from 0; this value is kept when the table is missing or holds no keys.
max_sk = 0
if spark.catalog.tableExists(full_table_name):
    highest_sk = (
        spark.table(full_table_name)
        .agg(F.max("customer_sk").alias("max_sk"))
        .first()["max_sk"]
    )
    # The aggregate comes back as None when there is nothing to take a max over.
    if highest_sk is not None:
        max_sk = int(highest_sk)
import dbldatagen as dg
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window

# Generation parameters: batch size, Spark partitions, and the domain used for emails.
num_new_rows, partitions = 50, 8
email_domain = "example.com"

# City → State mapping
city_to_state = dict(
    [
        ("Hyderabad", "Telangana"),
        ("Mumbai", "Maharashtra"),
        ("Delhi", "Delhi"),
        ("Bangalore", "Karnataka"),
        ("Chennai", "Tamil Nadu"),
        ("Pune", "Maharashtra"),
        ("Visakhapatnam", "Andhra Pradesh"),
        ("Kolkata", "West Bengal"),
        ("Ahmedabad", "Gujarat"),
        ("Jaipur", "Rajasthan"),
    ]
)

# Value pools the generated rows draw from.
first_names = [
    "Anita", "Priya", "Sneha", "Neha", "Pooja", "Kavya", "Arjun",
    "Amit", "Rahul", "Vikram", "Sanjay", "Varun", "Kiran",
]
last_names = [
    "Sharma", "Singh", "Patel", "Gupta", "Iyer", "Reddy",
    "Khan", "Das", "Nair", "Joshi", "Chowdhury", "Gowda",
]
segments = ["Retail", "Wholesale", "VIP"]

# Build Spark SQL map() literal for city→state:
# a flat, single-quoted sequence of key1, value1, key2, value2, ...
map_items = [f"'{part}'" for pair in city_to_state.items() for part in pair]
map_expr = f"map({','.join(map_items)})"

# Datagen spec
# Each entry is (column name, column type, keyword options passed to withColumn);
# the columns are added to the generator in exactly this order.
generated_columns = [
    # Surrogate key: the generator's id column offset by max_sk + 1.
    ("customer_sk", "long", {"expr": f"id + {max_sk} + 1"}),
    ("customer_id_num", "int", {"minValue": 1, "maxValue": 10000, "random": False}),
    ("first_name", "string", {"values": first_names, "random": True}),
    ("last_name", "string", {"values": last_names, "random": True}),
    ("city", "string", {"values": list(city_to_state.keys()), "random": True}),
    # State is looked up from the generated city through the SQL map literal.
    ("state", "string", {"expr": f"element_at({map_expr}, city)"}),
    ("country", "string", {"values": ["India"], "random": True}),
    ("customer_segment", "string", {"values": segments, "random": True}),
    ("effective_from", "date", {"expr": "current_date()"}),
    ("effective_to", "date", {"expr": "'2099-12-31'"}),
]

dg_spec = dg.DataGenerator(
    spark, rows=num_new_rows, partitions=partitions
).withIdOutput()
for column_name, column_type, column_options in generated_columns:
    dg_spec = dg_spec.withColumn(column_name, column_type, **column_options)

df_new = dg_spec.build()

# Compose fields (prefix customer_id_num -> string "CUST-<num>")
first_name_col = F.col("first_name")
last_name_col = F.col("last_name")

# "CUST-" followed by the numeric id rendered as a string.
customer_id_col = F.concat(F.lit("CUST-"), F.col("customer_id_num").cast("string"))
# First and last name joined by a single space.
full_name_col = F.concat_ws(" ", first_name_col, last_name_col)
# <first>.<last>@<email_domain>, with both name parts lower-cased.
email_col = F.concat(
    F.lower(first_name_col),
    F.lit("."),
    F.lower(last_name_col),
    F.lit("@"),
    F.lit(email_domain),
)

df_new = (
    df_new
    .withColumn("customer_id", customer_id_col)
    .withColumn("Name", full_name_col)
    .withColumn("email", email_col)
)

# is_current = latest effective_from per *customer_id* (SCD2 rule)
from pyspark.sql.window import Window

# Number each customer's rows newest-first; the row numbered 1 is the current one.
newest_first = Window.partitionBy("customer_id").orderBy(F.desc("effective_from"))
df_new = (
    df_new
    .withColumn("is_current", F.row_number().over(newest_first) == 1)
    # The numeric helper column is not kept in the output.
    .drop("customer_id_num")
)

display(df_new)

In [0]:
spark.sql("use catalog dev")
# Append the generated batch to the Delta table named by full_table_name, the same
# fully qualified name the max surrogate key was read from. Reusing that variable
# instead of re-typing "bronze.dim_customer" keeps the read and write targets in sync.
df_new.write.format("delta").mode("append").saveAsTable(full_table_name)
