In [25]:
from pyspark.sql import SparkSession
import datetime as dt
import pyspark.sql.functions as F
# Obtain the Spark session for this job (created under the app name 'etl'
# if none exists yet) and read the parquet catalog.
spark = SparkSession.builder.appName('etl').getOrCreate()
df = spark.read.parquet('data/catalog.parquet')

# Narrow the catalog in two steps: first to rows whose resourceType is
# 'Observation', then to those that have a non-null valueCodeableConcept.
filtered = df.where(F.col('resourceType') == 'Observation')
Observation = filtered.where(F.col('valueCodeableConcept').isNotNull())

# Top-level fields kept from each Observation row, in output order.
observation_fields = (
    'id',
    'subject',
    'code',
    'performer',
    'encounter',
    'meta',
    'effectiveDateTime',
    'valueCodeableConcept',
    'category',
)
Observation = Observation.select(*observation_fields)

# Print the resulting schema so the nested struct/array layout can be inspected.
Observation.printSchema()

# effectiveDateTime is a string; splitting on 'T' and taking element 0
# yields the part before the 'T' separator, used as the date column.
split_dates = F.split(F.col("effectiveDateTime"), 'T')

# Flatten the nested source fields into flat, OBSERVATION-style columns with
# a single projection. Column order is the same one the original
# rename/withColumn/drop chain produced, so positional consumers of the
# exported CSV see the same layout.
#
# Two fixes relative to the previous chain:
#   * the timestamp column is named observation_datetime (it was
#     measurement_datetime, inconsistent with every other column here);
#   * provider_id is the `reference` string of the first performer entry,
#     matching how person_id and visit_occurrence_id are derived, instead of
#     the raw performer array of structs.
Observation = Observation.select(
    F.col("id").alias("observation_id"),
    # Code of the first entry in code.coding.
    F.col("code").getField("coding").getItem(0).getField("code")
        .alias("observation_concept_id"),
    # performer is an array of {reference, display}; only the first is kept.
    F.col("performer").getItem(0).getField("reference")
        .alias("provider_id"),
    F.col("effectiveDateTime").alias("observation_datetime"),
    # Same extraction path as before: category -> coding[0] -> code[0].
    F.col("category").getField("coding").getItem(0).getField("code").getItem(0)
        .alias("observation_type_concept_id"),
    split_dates.getItem(0).alias("observation_date"),
    F.col("subject").getField("reference").alias("person_id"),
    F.col("valueCodeableConcept").getField("text").alias("value_as_string"),
    F.col("encounter").getField("reference").alias("visit_occurrence_id"),
)

# Collect the result to the driver as a pandas DataFrame and write it to
# obs.csv with a header row, then display the first five rows.
observation_pdf = Observation.toPandas()
observation_pdf.to_csv("obs.csv", header=True)
Observation.show(5)

root
 |-- id: string (nullable = true)
 |-- subject: struct (nullable = true)
 |    |-- reference: string (nullable = true)
 |    |-- display: string (nullable = true)
 |-- code: struct (nullable = true)
 |    |-- coding: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- system: string (nullable = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- display: string (nullable = true)
 |    |-- text: string (nullable = true)
 |-- performer: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- reference: string (nullable = true)
 |    |    |-- display: string (nullable = true)
 |-- encounter: struct (nullable = true)
 |    |-- reference: string (nullable = true)
 |-- meta: struct (nullable = true)
 |    |-- lastUpdated: string (nullable = true)
 |    |-- versionId: string (nullable = true)
 |-- effectiveDateTime: string (nullable = true)
 |-- valueCodeableConcept: struct (nullable = tr