In [2]:
from pyspark.sql import SparkSession

# Hive support lets this session save resources as Hive tables.
spark = (
    SparkSession.builder
    .config('hive.exec.dynamic.partition.mode', 'nonstrict')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
from bunsen.stu3.bundles import load_from_directory, extract_entry, write_to_database

# Previous source (old FHIR bundles from Bunsen), kept for reference:
# bundles = load_from_directory(spark, 'gs://bunsen/data/bundles').cache()

# Current source: the new FHIR bundles. The result is cached so the bundles
# are not reloaded every time they are used.
bundles = load_from_directory(
    spark,
    'gs://cluster-data/synthea-fhir-bundles/demo'
).cache()

In [4]:
# Extract the resource types used by the cells below from the cached bundles.
patients, encounters = (
    extract_entry(spark, bundles, resource_name)
    for resource_name in ('patient', 'encounter')
)

# Not extracted yet:
# observations = extract_entry(spark, bundles, 'observation')
# conditions = extract_entry(spark, bundles, 'condition')

In [5]:
from pyspark.sql.functions import explode
from datetime import datetime, date

def age(birthdate, today=None):
    """Return a person's age in completed years, as a string.

    Parameters
    ----------
    birthdate : str or None
        Birth date formatted as ``YYYY-MM-DD``.
    today : datetime.date, optional
        Date the age is measured against. Defaults to ``date.today()``;
        pass a fixed date to get reproducible output.

    Returns
    -------
    str or None
        Number of completed years between ``birthdate`` and ``today``,
        or ``None`` when ``birthdate`` is ``None`` (missing value).

    Raises
    ------
    ValueError
        If ``birthdate`` is a string that is not in ``YYYY-MM-DD`` format.
    """
    # A null birth date has no age; return None instead of failing the
    # whole column-wise apply.
    if birthdate is None:
        return None

    born = datetime.strptime(birthdate, "%Y-%m-%d")
    if today is None:
        today = date.today()

    # One year is subtracted when this year's birthday has not occurred yet.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return str(today.year - born.year - (1 if birthday_pending else 0))

# Demographic columns selected for each patient.
pats = patients.select(
    'id',
    'gender',
    'birthDate',
    'address.city',
    'address.state',
    'address.country',
)

# Bring a ten-row sample into pandas and derive the age from the birth date.
patsDF = (
    pats.limit(10)
        .toPandas()
        .assign(age=lambda frame: frame['birthDate'].apply(age))
)
display(patsDF)


Unnamed: 0,id,gender,birthDate,city,state,country,age
0,urn:uuid:f5261c16-8844-4736-9692-70f0eb8e953c,female,2004-05-01,[Marion],[Massachusetts],[US],14
1,urn:uuid:9eb8222a-c8d2-426c-8e47-abb5659ead92,female,1987-11-24,[Boston],[Massachusetts],[US],31
2,urn:uuid:14e23ee9-2357-446f-b0fa-213c3badca54,male,2000-10-01,[Franklin Town],[Massachusetts],[US],18
3,urn:uuid:099f7d4b-0207-4963-aae2-7d23fa12add7,male,1963-12-20,[Marshfield],[Massachusetts],[US],55
4,urn:uuid:1a84d943-e507-4e93-867a-8ff49718badb,male,1940-12-26,[Woburn],[Massachusetts],[US],78
5,urn:uuid:8d7b69c0-fac6-411a-937f-23f71bcf9003,male,2001-12-13,[Braintree Town],[Massachusetts],[US],17
6,urn:uuid:08e0212f-43ef-460f-9ba2-6f5b765e25e7,male,1964-05-06,[Mashpee],[Massachusetts],[US],54
7,urn:uuid:10fc33cf-8b24-480d-bb2c-1bc8443b76f2,male,1964-05-06,[Norwell],[Massachusetts],[US],54
8,urn:uuid:71d9d84e-a55a-4612-9cee-aeea3d063bcf,female,1989-10-28,[Northampton],[Massachusetts],[US],29
9,urn:uuid:5910f480-7a4d-4214-9f41-d35cb4ce5599,female,1967-01-03,[Salisbury],[Massachusetts],[US],52


In [6]:
from pyspark.sql.functions import col
# Inpatient and emergency encounters; explode() yields one row per entry
# in the encounter's `reason` array.
encs = (
    encounters
    .select('id', 'subject.reference', 'status', 'type.text', 'class.code',
            explode('reason'), 'period.start', 'period.end')
    .filter(col('class.code').isin("inpatient", "emergency"))
)

# Preview the first ten rows as a pandas frame.
encs.limit(10).toPandas()

Unnamed: 0,id,reference,status,text,code,col,start,end
0,urn:uuid:7659c043-a81a-4383-9c5f-5f2ce7e85d25,urn:uuid:9eb8222a-c8d2-426c-8e47-abb5659ead92,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2010-09-21T11:53:04-07:00,2010-09-21T13:23:04-07:00
1,urn:uuid:5ff5133f-b5f6-4b38-a5ed-46e408702cf2,urn:uuid:9eb8222a-c8d2-426c-8e47-abb5659ead92,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2013-08-06T11:53:04-07:00,2013-08-06T13:23:04-07:00
2,urn:uuid:f4908b46-7993-475e-ac88-6c0cb70af851,urn:uuid:9eb8222a-c8d2-426c-8e47-abb5659ead92,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2018-08-14T11:53:04-07:00,2018-08-14T13:23:04-07:00
3,urn:uuid:94b06819-4edd-4b85-a165-2999083eb850,urn:uuid:71d9d84e-a55a-4612-9cee-aeea3d063bcf,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2009-12-12T21:00:37-08:00,2009-12-12T22:30:37-08:00
4,urn:uuid:b5776c19-dba6-4ecb-aa39-55fe6ef0e7a2,urn:uuid:71d9d84e-a55a-4612-9cee-aeea3d063bcf,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2011-10-08T22:00:37-07:00,2011-10-08T23:30:37-07:00
5,urn:uuid:50265a79-6f67-427e-8328-2e15455cfd44,urn:uuid:71d9d84e-a55a-4612-9cee-aeea3d063bcf,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2016-01-02T21:00:37-08:00,2016-01-02T22:30:37-08:00
6,urn:uuid:a9969169-e2c8-475c-8d1f-f55ba21e1b4f,urn:uuid:5910f480-7a4d-4214-9f41-d35cb4ce5599,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2010-07-06T05:48:36-07:00,2010-07-06T07:03:36-07:00
7,urn:uuid:52148bd1-e235-46aa-b764-cad0cece857a,urn:uuid:5910f480-7a4d-4214-9f41-d35cb4ce5599,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2014-09-23T05:48:36-07:00,2014-09-23T07:18:36-07:00
8,urn:uuid:ca16bb8b-c2aa-4b2e-9586-9d34a1903799,urn:uuid:a7c00a2d-c14c-4183-9cda-b4b907cc4cd7,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2009-09-22T06:43:13-07:00,2009-09-22T08:13:13-07:00
9,urn:uuid:f59f041f-27ab-43ac-b538-be566117b35a,urn:uuid:a7c00a2d-c14c-4183-9cda-b4b907cc4cd7,finished,[Obstetric emergency hospital admission],emergency,"(None, [(None, http://snomed.info/sct, None, 7...",2011-09-20T06:43:13-07:00,2011-09-20T08:13:13-07:00


In [None]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from dateutil.parser import parse

def los(row):
    """Return the length of stay for one encounter row, as a string.

    Parameters
    ----------
    row : mapping
        Any object indexable by ``'start'`` and ``'end'`` (for example a
        pandas row passed by ``DataFrame.apply(..., axis=1)``). Each value
        is a timestamp string that ``dateutil.parser.parse`` accepts.

    Returns
    -------
    str or None
        ``str(end - start)``, e.g. ``'1:30:00'``, or ``None`` when either
        timestamp is ``None`` (no recorded start or end).
    """
    start_str = row['start']
    end_str = row['end']

    # A missing boundary means the duration is unknown; return None rather
    # than letting parse(None) raise and abort the whole apply.
    if start_str is None or end_str is None:
        return None

    start = parse(start_str)
    end = parse(end_str)
    duration = end - start
    return str(duration)

def los2(startStr, endStr):
    """Return the length of stay between two timestamp strings, as a string.

    Parameters
    ----------
    startStr : str or None
        Start timestamp in a format ``dateutil.parser.parse`` accepts.
    endStr : str or None
        End timestamp in a format ``dateutil.parser.parse`` accepts.

    Returns
    -------
    str or None
        ``str(end - start)``, e.g. ``'1:30:00'``, or ``None`` when either
        argument is ``None`` (no recorded start or end).
    """
    # A missing boundary means the duration is unknown; return None rather
    # than letting parse(None) raise.
    if startStr is None or endStr is None:
        return None

    start = parse(startStr)
    end = parse(endStr)
    duration = end - start
    return str(duration)
  
# Inpatient and emergency encounters, reduced to the columns needed to
# compute a length of stay.
encs = (
    encounters
    .select('subject.reference', 'class.code', 'period.start', 'period.end')
    .filter(col('class.code').isin("inpatient", "emergency"))
)

# Compute the length of stay in pandas on a ten-row sample.
encsDF = (
    encs.limit(10)
        .toPandas()
        .assign(los=lambda frame: frame.apply(los, axis=1))
)
encsDF.loc[:, ['reference', 'code', 'los']]

# 