In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, when, regexp_extract, from_json
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Create (or reuse) the Spark session used by every step below
spark = (
    SparkSession.builder
    .appName("DiabetesPrediction")
    .getOrCreate()
)

# Root directories: Parquet input is read from the first, results are written under the second
input_root_directory = "/home/jovyan/work/fhir-data/fhir-parquet"
analytics_root_directory = "/home/jovyan/work/fhir-data/analytics"

# Patient schema: only the fields this script reads (id, name, birthDate)
human_name_struct = StructType([
    StructField("given", ArrayType(StringType())),
    StructField("family", StringType()),
])
patient_schema = StructType([
    StructField("id", StringType()),
    StructField("name", ArrayType(human_name_struct)),
    StructField("birthDate", StringType()),
])

# Condition schema: the subject reference plus the list of codings under "code"
subject_struct = StructType([StructField("reference", StringType())])
coding_struct = StructType([
    StructField("system", StringType()),
    StructField("code", StringType()),
    StructField("display", StringType()),
])
codeable_concept_struct = StructType([
    StructField("coding", ArrayType(coding_struct)),
])
condition_schema = StructType([
    StructField("subject", subject_struct),
    StructField("code", codeable_concept_struct),
])

# Patients: read the Parquet data, parse the JSON held in the "resource" column,
# flatten the fields of interest, and discard rows whose id is null.
raw_patients = spark.read.parquet(f"{input_root_directory}/patients")
parsed_patients = raw_patients.select(
    from_json(col("resource"), patient_schema).alias("patient_data")
)
# Only the first entry of the name array is used (and its first given name)
first_name_entry = col("patient_data.name").getItem(0)
patients_df = parsed_patients.select(
    col("patient_data.id").alias("id"),
    first_name_entry.getField("family").alias("family_name"),
    first_name_entry.getField("given").getItem(0).alias("given_name"),
    col("patient_data.birthDate").alias("birthDate"),
).na.drop(subset=["id"])

# Conditions: same approach; keep the subject reference and the coding array,
# and discard rows where either of them is null.
raw_conditions = spark.read.parquet(f"{input_root_directory}/conditions")
parsed_conditions = raw_conditions.select(
    from_json(col("resource"), condition_schema).alias("condition_data")
)
conditions_df = parsed_conditions.select(
    col("condition_data.subject.reference").alias("patient_reference"),
    col("condition_data.code.coding").alias("coding"),
).na.drop(subset=["patient_reference", "coding"])

# Diabetes-related condition codes (SNOMED CT codes)
diabetes_codes = ["44054006", "46635009", "73211009"]  # Type 2 Diabetes, Type 1 Diabetes, Diabetes mellitus

# Explode the coding array (one row per coding) and keep only rows whose code
# is one of the diabetes codes.
# NOTE(review): only coding.code is compared; coding.system is not checked, so
# an identical code value from another code system would also match — confirm
# whether a system filter is wanted.
diabetes_conditions = conditions_df.select(
    col("patient_reference"),
    explode("coding").alias("coding"),
).filter(col("coding.code").isin(diabetes_codes))

# Derive the patient id from the subject reference. Both the bundle-style
# "urn:uuid:<id>" form and the relative "Patient/<id>" form are accepted;
# previously only "urn:uuid:" matched, and because regexp_extract yields an
# empty string when nothing matches, other reference forms silently produced
# no join key. The id character set also allows "." so dotted ids are not cut short.
diabetes_conditions = diabetes_conditions.withColumn(
    "patient_id",
    regexp_extract(col("patient_reference"), r"(?:urn:uuid:|Patient/)([\w.-]+)", 1),
)

# A patient row matches a diabetes condition row when the patient's id equals
# the patient_id extracted from the condition's subject reference.
same_patient = patients_df.id == diabetes_conditions.patient_id

# Inner join: patients that have at least one matching diabetes condition
# (one output row per matching condition coding).
patients_with_diabetes = patients_df.join(
    diabetes_conditions,
    on=same_patient,
    how="inner",
)

# Left anti join: patients with no matching diabetes condition at all.
patients_without_diabetes = patients_df.join(
    diabetes_conditions,
    on=same_patient,
    how="left_anti",
)

# Columns reported for each patient in both result sets
result_columns = ["id", "family_name", "given_name", "birthDate"]

# Build each de-duplicated result once and cache it: every result is consumed
# by two actions (show and write), and without caching each action would
# re-run the whole read/parse/join pipeline.
diabetes_result = patients_with_diabetes.select(*result_columns).distinct().cache()
non_diabetes_result = patients_without_diabetes.select(*result_columns).distinct().cache()

# Show the results for patients with diabetes (first 50 rows, untruncated)
print("Patients predicted to have diabetes:")
diabetes_result.show(50, truncate=False)

# Show the results for patients without diabetes (first 50 rows, untruncated)
print("Patients predicted to not have diabetes:")
non_diabetes_result.show(50, truncate=False)

# Save both result sets as Parquet, replacing any previous output
diabetes_result.write.mode("overwrite").parquet(f"{analytics_root_directory}/diabetes_patients")
non_diabetes_result.write.mode("overwrite").parquet(f"{analytics_root_directory}/non_diabetes_patients")

# Stop the Spark session (this also releases the cached data)
spark.stop()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/jovyan/work/fhir-data/fhir-parquet/conditions.