In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import StructType, StructField, StringType

# Get (or create) the Spark session used by this cell
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# Sample rows: one single-field tuple per person
data = [("John",), ("Alice",), ("Bob",)]

# Explicit schema: a single nullable string column called "Name"
schema = StructType([StructField("Name", StringType(), True)])

# Build the DataFrame and attach a generated "ID" column in one chain.
# The generated values are increasing but NOT consecutive, as the
# rendered output of this cell shows.
df = (
    spark.createDataFrame(data, schema)
    .withColumn("ID", monotonically_increasing_id())
)

# Render the resulting DataFrame
df.display()


Name,ID
John,17179869184
Alice,42949672960
Bob,60129542144


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Get (or create) the Spark session used by this cell
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# Schema given as a list of column names only
schema = ["Name"]

# Sample rows: one single-field tuple per person
data = [("John",), ("Alice",), ("Bob",)]

# Window over the whole DataFrame, sorted by Name.
# NOTE(review): there is no partitionBy here, so Spark presumably has to
# evaluate this window in a single partition -- confirm that is acceptable
# before using this pattern on large data.
window_spec = Window.orderBy("Name")

# Create the DataFrame and number its rows 1, 2, 3, ... in Name order,
# which yields a consecutive surrogate key (see the rendered output).
df = (
    spark.createDataFrame(data, schema)
    .withColumn("ID", row_number().over(window_spec))
)

# Render the resulting DataFrame
df.display()


Name,ID
Alice,1
Bob,2
John,3


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, monotonically_increasing_id

# Get (or create) the Spark session used by this cell
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# Schema given as a list of column names only
schema = ["Name"]

# Sample rows: one single-field tuple per person
data = [("John",), ("Alice",), ("Bob",)]

# Window over the whole DataFrame, sorted by the helper column "temp_id".
# The spec only names the column, so it can be built before the column exists.
# NOTE(review): there is no partitionBy here, so Spark presumably has to
# evaluate this window in a single partition -- confirm that is acceptable
# before using this pattern on large data.
window_spec = Window.orderBy("temp_id")

# Pipeline:
#   1. tag each row with an increasing helper id,
#   2. number the rows 1, 2, 3, ... in helper-id order to get consecutive IDs,
#   3. discard the helper column so only Name and ID remain.
# In the rendered output the numbering follows the order of `data`.
df = (
    spark.createDataFrame(data, schema)
    .withColumn("temp_id", monotonically_increasing_id())
    .withColumn("ID", row_number().over(window_spec))
    .drop("temp_id")
)

# Render the resulting DataFrame
df.display()


Name,ID
John,1
Alice,2
Bob,3
