In [0]:
from pyspark.sql.types import *
from datetime import datetime

# No Spark session is created in this notebook: `spark` is assumed to be the
# session object provided by the notebook runtime (TODO confirm when running
# outside such an environment).


def _d(iso_day):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    ``None`` is passed through unchanged so it can stand for a null cell.
    Returning a plain ``date`` (not a ``datetime``) matches the ``DateType``
    columns declared in the schema below.
    """
    if iso_day is None:
        return None
    return datetime.strptime(iso_day, "%Y-%m-%d").date()


# Schema: a center id, a case id, and five nullable date columns (Stage1..Stage5)
schema = StructType([
    StructField("Center_ID", StringType(), True),
    StructField("Case_ID", IntegerType(), True),
    StructField("Stage1", DateType(), True),
    StructField("Stage2", DateType(), True),
    StructField("Stage3", DateType(), True),
    StructField("Stage4", DateType(), True),
    StructField("Stage5", DateType(), True),
])

# Sample rows: (Center_ID, Case_ID, Stage1, Stage2, Stage3, Stage4, Stage5).
# None marks a stage with no recorded date.
data = [
    ("C1", 1, _d("2024-01-01"), None, None, _d("2024-01-13"), None),
    ("C1", 2, _d("2024-01-05"), _d("2024-01-10"), None, None, None),
    ("C2", 3, None, _d("2024-01-10"), None, None, _d("2024-01-20")),
    ("C3", 4, _d("2024-01-05"), _d("2024-01-12"), _d("2024-01-12"), _d("2024-01-14"), _d("2024-01-20")),
    ("C3", 5, _d("2024-01-10"), _d("2024-01-15"), None, None, None),
    ("C3", 6, None, None, None, _d("2024-01-15"), None),
]

# Build the DataFrame with the explicit schema (no type inference)
case_progress_df = spark.createDataFrame(data, schema=schema)

# Render the DataFrame in the notebook
case_progress_df.display()


Center_ID,Case_ID,Stage1,Stage2,Stage3,Stage4,Stage5
C1,1,2024-01-01,,,2024-01-13,
C1,2,2024-01-05,2024-01-10,,,
C2,3,,2024-01-10,,,2024-01-20
C3,4,2024-01-05,2024-01-12,2024-01-12,2024-01-14,2024-01-20
C3,5,2024-01-10,2024-01-15,,,
C3,6,,,,2024-01-15,


In [0]:
# Expose the DataFrame to Spark SQL as the temporary view "CaseProgress",
# replacing any existing temp view with that name.
case_progress_df.createOrReplaceTempView(name="CaseProgress")

In [0]:
# Per center, count the cases that have a date at stage N or at any later
# stage (a date in a later stage counts toward every earlier stage too).
#
# COALESCE over "stage N and everything after it" is non-null exactly when at
# least one of those dates is present, and COUNT(expr) counts only non-null
# values, so each COUNT below yields the number of qualifying cases.
# ORDER BY makes the displayed row order deterministic; GROUP BY alone does
# not guarantee any ordering.
query = """
WITH reached AS (
    SELECT Center_ID,
           COALESCE(Stage1, Stage2, Stage3, Stage4, Stage5) AS from_stage1,
           COALESCE(Stage2, Stage3, Stage4, Stage5)         AS from_stage2,
           COALESCE(Stage3, Stage4, Stage5)                 AS from_stage3,
           COALESCE(Stage4, Stage5)                         AS from_stage4,
           Stage5                                           AS from_stage5
    FROM CaseProgress
)
SELECT
    Center_ID,
    COUNT(from_stage1) AS Stage1,
    COUNT(from_stage2) AS Stage2,
    COUNT(from_stage3) AS Stage3,
    COUNT(from_stage4) AS Stage4,
    COUNT(from_stage5) AS Stage5
FROM reached
GROUP BY Center_ID
ORDER BY Center_ID
"""

# Run the query against the "CaseProgress" temp view
result_df = spark.sql(query)

# Render the per-center counts in the notebook
result_df.display()


Center_ID,Stage1,Stage2,Stage3,Stage4,Stage5
C1,2,2,1,1,0
C2,1,1,1,1,1
C3,3,3,2,2,1
