In [0]:
%pip install dbldatagen

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Number of synthetic payment rows to generate.
row_count = 1000

# Seed frame: spark.range yields a single 'id' column running 0..row_count-1.
df = spark.range(row_count)

# Literal array columns holding the provider names available per payment method.
card_providers = F.array(*[F.lit(name) for name in ("visa", "mastercard", "american express")])
upi_providers = F.array(*[F.lit(name) for name in ("paytm", "phonepe")])

# The 26 uppercase letters as a literal array column, in alphabetical order.
letters = F.array(*[F.lit(ch) for ch in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"])

# Reusable expressions over the 0-based 'id' column.
id_col = F.col("id")
id_text = id_col.cast("string")

# payment_id layout: PID-<id left-padded to 4 chars><letter>-<id>
# lpad pads AND truncates to the requested width, so an id of 10000 or more
# contributes only its first four characters to the padded segment; the
# trailing "-<id>" part still carries the full id.
payment_id_expr = F.concat(
    F.lit("PID-"),
    F.lpad(id_text, 4, "0"),
    F.col("letter"),
    F.lit("-"),
    id_text,
)

df = (
    df
    # 1-based surrogate key derived from the 0-based id.
    .withColumn("surrogate_key", (id_col + F.lit(1)).cast("long"))
    # Cycle through the letters array deterministically: id modulo array size.
    .withColumn("letter_idx", (id_col % F.size(letters)).cast("int"))
    .withColumn("letter", letters[F.col("letter_idx")])
    .withColumn("payment_id", payment_id_expr)
)

# Base seed for the synthetic draws. Each rand() call gets its own offset so the
# three helper columns are independent of each other; reusing one seed for all
# three would produce identical columns.
# NOTE(review): Spark documents rand as non-deterministic in the general case;
# repeatability presumably also needs the same input partitioning -- confirm.
RANDOM_SEED = 42


def _pick_uniform(options, rnd):
    """Select one element of the array column `options` using `rnd`.

    `rnd` is expected to be a column of values in [0, 1) (as produced by
    F.rand); it is scaled by the array size and floored to a 0-based index.
    """
    return options[F.floor(rnd * F.size(options)).cast("int")]


df = (
    df
    # Payment method: equal thirds of the [0, 1) range -> card / cash / UPI.
    .withColumn("rnd_method", F.rand(seed=RANDOM_SEED))
    .withColumn(
        "method",
        F.when(F.col("rnd_method") < 1 / 3.0, F.lit("card"))
        .when(F.col("rnd_method") < 2 / 3.0, F.lit("cash"))
        .otherwise(F.lit("UPI")),
    )
    # Status: [0, 0.7) Success, [0.7, 0.9) Failed, remainder Pending.
    .withColumn("rnd_status", F.rand(seed=RANDOM_SEED + 1))
    .withColumn(
        "status",
        F.when(F.col("rnd_status") < 0.7, F.lit("Success"))
        .when(F.col("rnd_status") < 0.9, F.lit("Failed"))
        .otherwise(F.lit("Pending")),
    )
    # Provider depends on the method: card and UPI rows pick from their own
    # provider array; every other method (i.e. cash) is labelled "Manual".
    .withColumn("rnd_provider", F.rand(seed=RANDOM_SEED + 2))
    .withColumn(
        "provider",
        F.when(
            F.col("method") == "card",
            _pick_uniform(card_providers, F.col("rnd_provider")),
        )
        .when(
            F.col("method") == "UPI",
            _pick_uniform(upi_providers, F.col("rnd_provider")),
        )
        .otherwise(F.lit("Manual")),
    )
    # Remove the random draws and the intermediate id/letter helper columns.
    .drop("rnd_method", "rnd_status", "rnd_provider", "id", "letter_idx", "letter")
)



In [0]:
# Turn off ANSI SQL mode for this Spark session.
# NOTE(review): this cell runs after the DataFrame has already been defined;
# whether the setting affects expressions built earlier depends on when Spark
# reads it -- confirm, or move this to the top of the notebook.
spark.conf.set(
    "spark.sql.ansi.enabled",
    "false",
)

In [0]:
# Append the generated rows to the target table.
# NOTE(review): with append mode every re-run of the notebook adds another
# batch of rows to the same table -- confirm that repeated loads are intended.
target_table = "dev.bronze.dim_payments"
df.write.saveAsTable(target_table, mode="append")