In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Initialize Spark session
spark = SparkSession.builder.appName("HomeListings").getOrCreate()

# Sample listing events, one row per (ListingID, HomeID, Status)
data = [
    (1, "Home A", "New Listing"),
    (2, "Home A", "Pending"),
    (3, "Home A", "Relisted"),
    (4, "Home B", "New Listing"),
    (5, "Home B", "Under Contract"),
    (6, "Home B", "Relisted"),
    (7, "Home C", "New Listing"),
    (8, "Home C", "Under Contract"),
    (9, "Home C", "Closed"),
]

columns = ["ListingID", "HomeID", "Status"]
df = spark.createDataFrame(data, columns)

# Expose the DataFrame to SQL under the name the %sql cell queries, so the
# notebook runs top to bottom on a fresh session instead of relying on a
# table that happens to exist already.
df.createOrReplaceTempView("HomeListings")

# Statuses that open a new group; every other status stays in the current one.
GROUP_START_STATUSES = ("New Listing", "Relisted")

# Running frame from the first row up to the current row, ordered by ListingID.
# There is no partitionBy, so the running total spans all homes; Spark
# evaluates such a window on a single partition, which is fine for this small
# sample but will not scale to large inputs.
windowSpec = Window.orderBy("ListingID").rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)

# GroupID = running count of group-opening rows seen so far: each
# "New Listing"/"Relisted" row contributes 1, every other row contributes 0.
df_with_groupid = df.withColumn(
    "GroupID",
    F.sum(
        F.when(F.col("Status").isin(*GROUP_START_STATUSES), 1).otherwise(0)
    ).over(windowSpec),
)

# Show the result
df_with_groupid.display()

ListingID,HomeID,Status,GroupID
1,Home A,New Listing,1
2,Home A,Pending,1
3,Home A,Relisted,2
4,Home B,New Listing,3
5,Home B,Under Contract,3
6,Home B,Relisted,4
7,Home C,New Listing,5
8,Home C,Under Contract,5
9,Home C,Closed,5


In [0]:
%sql
-- GroupID is a running count of group-opening rows ('New Listing' or
-- 'Relisted') in ListingID order.
-- The frame is spelled out as ROWS: with only ORDER BY, the default frame is
-- RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, which gives every row
-- sharing a ListingID the same total. An explicit ROWS frame matches the
-- rowsBetween(unboundedPreceding, currentRow) window used in the PySpark cell.
SELECT
  *,
  SUM(
    CASE
      WHEN Status IN ('New Listing', 'Relisted') THEN 1
      ELSE 0
    END
  ) OVER(
    ORDER BY
      ListingID
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS GroupID
FROM
  HomeListings;

ListingID,HomeID,Status,GroupID
1,Home A,New Listing,1
2,Home A,Pending,1
3,Home A,Relisted,2
4,Home B,New Listing,3
5,Home B,Under Contract,3
6,Home B,Relisted,4
7,Home C,New Listing,5
8,Home C,Under Contract,5
9,Home C,Closed,5
