In [1]:
from pyspark.sql import SparkSession
import datetime as dt
import pyspark.sql.functions as F
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# one if present), registered under the application name 'etl'.
spark = (
    SparkSession.builder
    .appName('etl')
    .getOrCreate()
)

# Read the parquet catalog that every later cell works from.
df = spark.read.parquet('data/catalog.parquet')

## Measurement-mapping

TODO:

[X] Rename Columns  
[X] Split Datetime  
[X] Coalesce `valueQuantity.value` (double/long) into a single `value_as_number`  
[X] Combine blood pressure components  
[ ] Check all required fields mapped

In [2]:
# Keep only the rows whose resourceType is 'Observation'.
filtered = df.where(F.col('resourceType') == 'Observation')

In [110]:
# --- Split the Observation resources ---------------------------------------
# Rows with no coded value (valueCodeableConcept is null) feed the Measurement
# mapping below; rows that do carry one are kept in `Observation`.
Measurement = filtered.filter(F.col("valueCodeableConcept").isNull())
Observation = filtered.filter(F.col("valueCodeableConcept").isNotNull())

# Keep only the source fields the mapping actually reads.
Measurement = Measurement.select(
    "id",
    "subject",
    "code",
    "performer",
    "encounter",
    "category",
    "valueQuantity",
    "effectiveDateTime",
    "component",
)

# Reusable column expressions.
# The date is the part of effectiveDateTime before the 'T' separator.
split_dates = F.split(F.col("effectiveDateTime"), 'T')
# valueQuantity.value is read as a struct with `double` and `long` members
# (it is first copied into value_as_number below); take whichever is set,
# preferring the double.
val_as_num = F.coalesce(F.col("value_as_number.double"), F.col("value_as_number.long"))

# --- Rename / flatten into the measurement columns -------------------------
# The order of the withColumn calls fixes the output column order, so it is
# kept deliberately: measurement_date, person_id, measurement_type_concept_id,
# value_as_number, visit_occurrence_id, unit_source_value.
Measurement = (
    Measurement
    .withColumnRenamed("id", "measurement_id")
    .withColumn("measurement_date", split_dates.getItem(0))
    .withColumn("person_id", F.col("subject.reference"))
    .drop("subject")
    .withColumnRenamed("code", "measurement_concept_id")
    .withColumnRenamed("effectiveDateTime", "measurement_datetime")
    # code of the first coding of the first category entry
    .withColumn("measurement_type_concept_id",
                F.col("category").getItem(0).coding.code.getItem(0))
    # stage the raw value struct, then collapse it to a single number
    .withColumn("value_as_number", F.col("valueQuantity.value"))
    .withColumn("visit_occurrence_id", F.col("encounter.reference"))
    .drop("encounter")
    .withColumn("value_as_number", val_as_num)
    .withColumn("unit_source_value", F.col("valueQuantity.unit"))
    .withColumnRenamed("performer", "provider_id")
    .drop("valueQuantity")
    # replace the code struct by the code of its first coding
    .withColumn("measurement_concept_id",
                F.col("measurement_concept_id").coding.getItem(0).code)
)

# Schema before the blood-pressure components are folded in
# (category and component are still present at this point).
Measurement.printSchema()

# --- Combine blood pressure components -------------------------------------
# NOTE(review): component[0] is treated as diastolic and component[1] as
# systolic purely by position -- confirm the source always emits them in
# that order.
# When a diastolic reading > 0 exists, value_as_number becomes
# [systolic, diastolic]; otherwise it is the scalar value wrapped in a
# one-element array, so the column has one array type for every row.
# NOTE(review): rows with neither a scalar value nor components end up as
# [null] rather than null -- confirm that is acceptable downstream.
Measurement = (
    Measurement
    .withColumn("diastolic", F.col("component").getItem(0).valueQuantity.value)
    .withColumn("systolic", F.col("component").getItem(1).valueQuantity.value)
    .withColumn("value_as_num_combine",
                F.when(F.col("diastolic") > 0, F.array("systolic", "diastolic")))
    .withColumn("value_as_number", F.array(F.col("value_as_number")))
    .withColumn("value_as_number",
                F.coalesce(F.col("value_as_num_combine"), F.col("value_as_number")))
    # drop the helpers and the nested source columns that are now flattened
    .drop("diastolic", "systolic", "value_as_num_combine", "component", "category")
)

# Optional CSV export (disabled):
#Measurement.toPandas().to_csv("measurememts.csv", header=True)

Measurement.show(1)

root
 |-- measurement_id: string (nullable = true)
 |-- measurement_concept_id: string (nullable = true)
 |-- provider_id: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- reference: string (nullable = true)
 |    |    |-- display: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- coding: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- system: string (nullable = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- display: string (nullable = true)
 |    |    |-- text: string (nullable = true)
 |-- measurement_datetime: string (nullable = true)
 |-- component: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- code: struct (nullable = true)
 |    |    |    |-- coding: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull =