In [63]:
from pyspark.sql import SparkSession
import datetime as dt
import pyspark.sql.functions as F
# Obtain the Spark session for the 'etl' application (getOrCreate hands back
# an already-running session when there is one) and read the catalog
# parquet dataset into a DataFrame.
spark = (
    SparkSession.builder
    .appName('etl')
    .getOrCreate()
)
df = spark.read.parquet('data/catalog.parquet')

# Step 1: keep only the rows whose resourceType equals 'Observation'.
filtered = df.where(df['resourceType'] == 'Observation')
# Step 2: of those, keep only rows with a non-null valueCodeableConcept.
Observation = filtered.where(filtered['valueCodeableConcept'].isNotNull())

# Keep only these nine columns; every other column of the catalog is
# discarded from this point on.
Observation = Observation.select(
    'id',
    'subject',
    'code',
    'performer',
    'encounter',
    'meta',
    'effectiveDateTime',
    'valueCodeableConcept',
    'category',
)

# effectiveDateTime is split on the literal 'T'; element 0 of the result is
# stored below as "observation_date".
# (assumes an ISO-8601 style "<date>T<time>" string -- TODO confirm)
split_dates = F.split(Observation['effectiveDateTime'], 'T')

# Reshape the selected columns into the output layout:
#   * rename id / code / effectiveDateTime / performer,
#   * copy nested fields of category / subject / valueCodeableConcept /
#     encounter into new top-level columns,
#   * drop the source structs that are not wanted in the output.
# The right-hand side is evaluated before the name is rebound, so every
# Observation[...] below refers to the frame as it was before this statement.
# Each derived column is added while its source column still exists under
# its original name.
# NOTE(review): the target names look like OMOP-style fields. If so,
# "measurement_datetime" may have been intended as "observation_datetime",
# and the dotted name "observation_type_concept_id.coding" has to be
# backtick-quoted whenever it is referenced by name in Spark. Confirm before
# renaming either one: both names end up in the CSV header.
Observation = (
    Observation
    .withColumnRenamed("id", "observation_id")
    .withColumn("observation_type_concept_id.coding", Observation["category"]["coding"])
    .withColumn("observation_date", split_dates.getItem(0))
    .withColumn("person_id", Observation["subject"]["reference"])
    .withColumn("value_as_string", Observation["valueCodeableConcept"]["text"])
    .withColumnRenamed("code", "observation_concept_id")
    .withColumnRenamed("effectiveDateTime", "measurement_datetime")
    .withColumn("visit_occurrence_id", Observation["encounter"]["reference"])
    .withColumnRenamed("performer", "provider_id")
    # All source structs are removed in one call once nothing reads them.
    .drop("valueCodeableConcept", "encounter", "subject", "meta")
)
# Print the resulting schema for a visual check.
Observation.printSchema()

# Convert the Spark frame to pandas and write it to obs.csv with a header row.
# NOTE(review): toPandas() brings the whole result onto the driver, and
# to_csv() with these arguments also writes the pandas index as the first
# column -- confirm both are intended.
(
    Observation
    .toPandas()
    .to_csv(path_or_buf="obs.csv", header=True)
)

root
 |-- observation_id: string (nullable = true)
 |-- observation_concept_id: struct (nullable = true)
 |    |-- coding: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- system: string (nullable = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- display: string (nullable = true)
 |    |-- text: string (nullable = true)
 |-- provider_id: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- reference: string (nullable = true)
 |    |    |-- display: string (nullable = true)
 |-- measurement_datetime: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- coding: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- system: string (nullable = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- display: string (nullable = 