In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, schema_of_json
from pyspark.sql.types import StructType, StringType

# Initialize Spark session
spark = SparkSession.builder.appName("FHIR Patient Data Loader").getOrCreate()

# Everything after session creation runs under try/finally so the session is
# stopped even when reading or parsing raises.
try:
    # Set the path to the patient Parquet file
    patient_parquet_path = "/home/jovyan/work/fhir-data/fhir-parquet/patients"

    # Load the patient data
    df_patients = spark.read.parquet(patient_parquet_path)

    # Print the schema
    print("Schema of the patient data:")
    df_patients.printSchema()

    # Show the first 3 rows of the raw data
    print("\nFirst 3 rows of raw patient data:")
    df_patients.show(3, truncate=False)

    # Only attempt JSON parsing when 'resource' exists and is a string column.
    has_json_resource = (
        "resource" in df_patients.columns
        and isinstance(df_patients.schema["resource"].dataType, StringType)
    )

    if has_json_resource:
        # Pick the first non-null JSON document as the schema sample.
        # first() returns None on an empty result, and schema_of_json cannot
        # take a null sample, so both cases are guarded here.
        sample_row = (
            df_patients.select("resource")
            .where(col("resource").isNotNull())
            .first()
        )

        if sample_row is None:
            print("\nThe 'resource' column has no non-null values; nothing to parse.")
        else:
            # Infer the schema from the sampled JSON document.
            # NOTE(review): the schema comes from a single record, so fields
            # absent from that record will not appear in the parsed output.
            json_schema = schema_of_json(sample_row[0])

            # Parse the JSON data
            df_patients_parsed = df_patients.withColumn(
                "parsed_resource", from_json(col("resource"), json_schema)
            )

            # Select all fields from the parsed JSON
            df_patients_parsed = df_patients_parsed.select("parsed_resource.*")

            # Print the new schema
            print("\nSchema of the parsed patient data:")
            df_patients_parsed.printSchema()

            # Show the first 3 rows of the parsed data
            print("\nFirst 3 rows of parsed patient data:")
            df_patients_parsed.show(3, truncate=False)
    else:
        print("\nThe 'resource' column is not a string or doesn't exist. Showing the data as is.")
        df_patients.show(3, truncate=False)
finally:
    # Stop the Spark session
    spark.stop()

Schema of the patient data:
root
 |-- resource: string (nullable = true)


First 3 rows of raw patient data:
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------