In [None]:
import pyspark
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
#from pyspark.sql.functions import sequence, to_date, explode, col
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Session handle used by every later cell to read the CSV inputs.
spark = SparkSession.builder.getOrCreate() 

In [None]:
def _chart_key_fields():
    """Return the (icustay_id, charttime) fields that lead every schema below."""
    return [
        StructField("icustay_id", IntegerType()),
        StructField("charttime", TimestampType()),
    ]


def _double_fields(*names):
    """Return one DoubleType StructField per given column name, in order."""
    return [StructField(name, DoubleType()) for name in names]


# Explicit schema for vital.csv: key columns followed by the vital-sign measurements.
schema_vital = StructType(
    _chart_key_fields()
    + _double_fields(
        "heartrate",
        "sysbp",
        "diasbp",
        "meanbp",
        "resprate",
        "tempc",
        "spo2",
        "glucose",
    )
)

# Explicit schema for fio2.csv: key columns plus the single fio2 measurement.
schema_fio2 = StructType(_chart_key_fields() + _double_fields("fio2"))

# Explicit schema for gcs.csv: key columns, the GCS scores, and an integer flag.
schema_gcs = StructType(
    _chart_key_fields()
    + _double_fields("gcs", "gcsmotor", "gcsverbal", "gcseyes")
    + [StructField("endotrachflag", IntegerType())]
)


In [None]:
def _read_csv(path, schema=None):
    """Read a comma-separated file with a header row.

    When `schema` is given it is applied as-is; otherwise Spark infers the
    column types from the data.
    """
    if schema is None:
        return spark.read.csv(path, sep=',', inferSchema=True, header=True)
    return spark.read.csv(path, sep=',', schema=schema, header=True)


# Derived chart-event extracts, read with the explicit schemas.
df_vital = _read_csv('gs://peaceful-bruin-307600/derived/vital.csv', schema_vital)
df_fio2 = _read_csv('gs://peaceful-bruin-307600/derived/fio2.csv', schema_fio2)
df_gcs = _read_csv('gs://peaceful-bruin-307600/derived/gcs.csv', schema_gcs)

# Remaining inputs, read with inferred schemas.
df_clinical = _read_csv('gs://peaceful-bruin-307600/clinical_features_v2.csv')
df_cohort = _read_csv('gs://peaceful-bruin-307600/cohort_diag_v15.csv')
df_lab = _read_csv('gs://peaceful-bruin-307600/lab_results_final.csv')

In [None]:
# Expose the raw DataFrames to Spark SQL under short view names.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and is safe to re-run (an existing view is replaced).
df_vital.createOrReplaceTempView('vital')
df_fio2.createOrReplaceTempView('fio2')
df_gcs.createOrReplaceTempView('gcs')
df_clinical.createOrReplaceTempView('clinical')
df_cohort.createOrReplaceTempView('cohort')
df_lab.createOrReplaceTempView('lab')

In [None]:
# Add a charttime_hr column to the vitals, fio2 and gcs data by truncating
# charttime down to the start of its hour, then expose each result as a
# temp view for the SQL cells.
# createOrReplaceTempView replaces the deprecated registerTempTable.
df_vital_cleansed = df_vital.withColumn('charttime_hr', date_trunc('hour', df_vital.charttime))
df_vital_cleansed.createOrReplaceTempView('vital_cleansed')

df_fio2_cleansed = df_fio2.withColumn('charttime_hr', date_trunc('hour', df_fio2.charttime))
df_fio2_cleansed.createOrReplaceTempView('fio2_cleansed')

df_gcs_cleansed = df_gcs.withColumn('charttime_hr', date_trunc('hour', df_gcs.charttime))
df_gcs_cleansed.createOrReplaceTempView('gcs_cleansed')


In [None]:
#Get Average Vitals values where multiple readings occur in same chart hour, join vitals to cohort
# Each CTE collapses one source to a single row per (icustay_id, charttime_hr);
# the final select left-joins those hourly averages onto every cohort row, so
# cohort rows without a matching reading keep NULLs in the v_* columns.
query = \
"""
With AVG_VITALS_CTE AS (
select
    
    icustay_id
    ,charttime_hr
    ,avg(heartrate) as heartrate
    ,avg(sysbp) as sysbp
    ,avg(diasbp) as diasbp
    ,avg(meanbp) as meanbp
    ,avg(resprate) as resprate
    ,avg(tempc) as tempc
    ,avg(spo2) as spo2
    ,avg(glucose) as glucose
    ,avg((heartrate*1.0)/sysbp) as shockindex

from  vital_cleansed
group by icustay_id, charttime_hr
)

,AVG_FIO2_CTE AS (
select
    
    icustay_id
    ,charttime_hr
    ,avg(fio2) as fio2

from  fio2_cleansed
group by icustay_id, charttime_hr
)

,AVG_GCS_CTE AS (
select
    
    icustay_id
    ,charttime_hr
    ,avg(gcs) as gcs

from  gcs_cleansed
group by icustay_id, charttime_hr
)

--Join averages per hour to Cohort
select 
    c.*
    ,v.icustay_id as v_icustay_id
    ,v.charttime_hr as v_charttime_hr
    ,v.heartrate as v_heartrate
    ,v.sysbp as v_sysbp
    ,v.diasbp as v_diasbp
    ,v.meanbp as v_meanbp
    ,v.resprate as v_resprate
    ,v.tempc as v_tempc
    ,v.spo2 as v_spo2
    ,v.glucose as v_glucose
    ,v.shockindex as v_shockindex
    ,f.fio2 as v_fio2
    ,g.gcs as v_gcs

from cohort c 
     LEFT JOIN AVG_VITALS_CTE v ON (c.icustay_id = v.icustay_id) AND (c.timestamp = v.charttime_hr)
     LEFT JOIN AVG_FIO2_CTE f ON (c.icustay_id = f.icustay_id) AND (c.timestamp = f.charttime_hr)
     LEFT JOIN AVG_GCS_CTE g ON (c.icustay_id = g.icustay_id) AND (c.timestamp = g.charttime_hr)

where 1=1
      --and c.icustay_id = 200033

"""    

# Run through the SparkSession created at the top of the notebook; `sqlContext`
# is never defined here and only exists when the kernel happens to inject it.
df_cohort_vitals = spark.sql(query)
# createOrReplaceTempView replaces the deprecated registerTempTable.
df_cohort_vitals.createOrReplaceTempView('cohort_vitals')
#df_cohort_vitals.repartition(1).write.option("header", "true").csv('gs://peaceful-bruin-307600/cohort_vitals_v2.csv')

In [None]:
#Get Average Lab values where multiple readings occur in same chart hour, join lab results to cohort
# The CTE keys each lab row on (endtime - 1 hour) and averages duplicates for
# the same stay and hour; the final select left-joins those averages onto
# cohort_vitals, so rows without lab data keep NULLs in the l_* columns.

query = \
"""
With AVG_LAB_CTE AS (
select
    
    icustay_id
    ,(endtime - INTERVAL 1 HOUR) as charttime_hr
    ,avg(BUN) as BUN
    ,avg(CREATININE) as CREATININE
    ,avg((BUN*1.0)/CREATININE) as BUN_CREATININE_RATIO
    ,avg(WBC) as WBC
    ,avg(PLATELET) as PLATELET
    ,avg(PH) as PH
    ,avg(PO2) as PO2
    ,avg(liver) as liver
    ,avg(renal) as renal
    
    --liver_24hours
    --renal_24hours

from  lab
group by icustay_id, (endtime - INTERVAL 1 HOUR)
)

--Join averages per hour to cohort_vitals
select 
    cv.*
    ,l.icustay_id as l_icustay_id
    ,l.charttime_hr as l_charttime_hr
    ,l.BUN as l_BUN
    ,l.CREATININE as l_CREATININE
    ,l.BUN_CREATININE_RATIO as l_BUN_CREATININE_RATIO
    ,l.WBC as l_WBC
    ,l.PLATELET as l_PLATELET
    ,l.PH as l_PH
    ,l.PO2 as l_PO2
    ,l.liver as l_liver
    ,l.renal as l_renal
   
from cohort_vitals as cv
     LEFT JOIN AVG_LAB_CTE l ON (cv.icustay_id = l.icustay_id) AND (cv.timestamp = l.charttime_hr)

where 1=1
      --and c.icustay_id = 200033

"""    

# Run through the SparkSession created at the top of the notebook; `sqlContext`
# is never defined here and only exists when the kernel happens to inject it.
df_cohort_vitals_lab = spark.sql(query)
#df_cohort_vitals_lab.repartition(1).write.option("header", "true").csv('gs://peaceful-bruin-307600/cohort_vitals_lab_v2_247247.csv')

In [None]:
# Fill missing values within each ICU stay: first carry the last known value
# forward, then back-fill any leading gaps with the first later value.
#Code Reference: Paul Lee's Lab Notebook, https://stackoverflow.com/questions/38131982/forward-fill-missing-values-in-spark-python

#colsfill = ['v_heartrate', 'v_sysbp', 'v_diasbp', 'v_meanbp', 'v_resprate', 'v_tempc', 'v_spo2', 'v_glucose']
colsfill = ['v_heartrate', 'v_sysbp', 'v_resprate', 'v_shockindex', 'v_fio2', 'v_gcs', 'l_BUN', 'l_CREATININE',
            'l_BUN_CREATININE_RATIO', 'l_WBC', 'l_PLATELET', 'l_PH', 'l_PO2', 'l_liver', 'l_renal', 'sofa_24hours',
            'v_diasbp', 'v_meanbp', 'v_tempc', 'v_spo2', 'v_glucose']

# Forward fill: frame covers every earlier row of the stay up to the current
# one. unboundedPreceding is used instead of a fixed -1000000 offset so the
# look-back is never truncated on very long partitions.
forward_window = (
    Window.partitionBy('icustay_id')
    .orderBy('hour')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# The loop variable is deliberately not named `col`, which would shadow
# pyspark.sql.functions.col for the rest of the notebook.
for fill_col in colsfill:
    df_cohort_vitals_lab = df_cohort_vitals_lab.withColumn(
        fill_col, last(fill_col, ignorenulls=True).over(forward_window))

# Backward fill: frame covers the current row and every later row of the stay,
# so rows before the first observation take that first observed value.
backward_window = (
    Window.partitionBy('icustay_id')
    .orderBy('hour')
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)
)

for fill_col in colsfill:
    df_cohort_vitals_lab = df_cohort_vitals_lab.withColumn(
        fill_col, first(fill_col, ignorenulls=True).over(backward_window))

# The join helper columns are no longer needed once the values are filled.
columns_to_drop = ['v_icustay_id', 'v_charttime_hr', 'l_icustay_id', 'l_charttime_hr']
df_cohort_vitals_lab = df_cohort_vitals_lab.drop(*columns_to_drop)

#df_cohort_vitals_lab.repartition(1).write.option("header", "true").csv('gs://peaceful-bruin-307600/cohort_vitals_lab_filled_V2_247247.csv')

In [None]:
#Where prior and post values do not exist (No Values Filled), fills in with population mean
#Code Reference: Paul Lee's Lab Notebook
def fill_with_mean(df, exclude=frozenset()):
    """Fill remaining nulls in each non-excluded column with that column's mean.

    Args:
        df: Spark DataFrame to fill.
        exclude: Collection of column names that must be left untouched.

    Returns:
        A DataFrame in which nulls of the non-excluded columns are replaced by
        the column mean computed over all rows of `df`. Columns whose mean
        comes back as null (for example, a column with no non-null values)
        are left unchanged. `df` itself is returned when there is nothing
        to fill.
    """
    target_cols = [c for c in df.columns if c not in exclude]
    if not target_cols:
        # agg() rejects an empty expression list, so bail out early.
        return df

    stats = df.agg(*(avg(c).alias(c) for c in target_cols))

    # na.fill() rejects null replacement values, so keep only usable means.
    means = {
        name: value
        for name, value in stats.first().asDict().items()
        if value is not None
    }
    if not means:
        return df
    return df.na.fill(means)


# Columns that must not be mean-filled. "sofa_24hours" is intentionally left
# out of this list, so it is mean-filled along with the v_* and l_* columns.
mean_fill_exclude = [
    "subject_id", "hadm_id", "icustay_id", "dbsource", "first_careunit", "icu_length_of_stay",
    "intime", "outtime", "age", "gender", "ethnicity", "HAS_CHARTEVENTS_DATA", "icustay_id_order",
    "exclusion_los", "exclusion_age", "exclusion_first_stay", "exclusion_surgical", "exclusion_icu_db",
    "exclusion_bad_data", "suspected_infection_time_poe", "inf_window_start", "inf_window_end",
    "exclusion_sus_inf_window", "intime_round", "outtime_round", "hour", "timestamp",
    "Sepsis3_start_flg", "Sepsis3_diag_flg", "sepsis_onset_hr",
]

df_cohort_vitals_lab = fill_with_mean(df_cohort_vitals_lab, mean_fill_exclude)

# createOrReplaceTempView replaces the deprecated registerTempTable.
df_cohort_vitals_lab.createOrReplaceTempView('cohort_vitals_lab')
#df_cohort_vitals_lab.repartition(1).write.option("header", "true").csv('gs://peaceful-bruin-307600/cohort_vitals_lab_filled_mean.csv')

In [None]:
# Join clinical data to the filled cohort/vitals/lab table.
# The left join on icustay_id keeps every cohort_vitals_lab row; stays with no
# clinical record get NULLs in the c_* columns.
query = \
"""
select 
    cvl.*
    ,c.CLD_CODES as c_CLD_CODES
    ,c.HM_CODES as c_HM_CODES
    ,c.CHF_CODES as c_CHF_CODES
    ,c.COI_CODES as c_COI_CODES
    ,c.DIAB_CODES as c_DIAB_CODES
    ,c.MC_CODES as c_MC_CODES
    ,c.UR_AMT as c_UR_AMT
    ,c.SIRS as c_SIRS
    ,c.CSURG as c_CSURG

from cohort_vitals_lab cvl
     LEFT JOIN clinical c ON (cvl.icustay_id = c.icustay_id) 
where 1=1
    --and c.icustay_id = 200033

"""    
# Run through the SparkSession created at the top of the notebook; `sqlContext`
# is never defined here and only exists when the kernel happens to inject it.
df_cohort_vitals_lab_clinical = spark.sql(query)

In [None]:
# Columns removed before export ("sofa_24hours" is deliberately kept).
columns_to_drop = [
    "hadm_id",
    "dbsource",
    "first_careunit",
    "icu_length_of_stay",
    "intime",
    "outtime",
    "HAS_CHARTEVENTS_DATA",
    "icustay_id_order",
    "exclusion_los",
    "exclusion_age",
    "exclusion_first_stay",
    "exclusion_surgical",
    "exclusion_icu_db",
    "exclusion_bad_data",
    "suspected_infection_time_poe",
    "inf_window_start",
    "inf_window_end",
    "exclusion_sus_inf_window",
    "intime_round",
    "outtime_round",
]

df_cohort_vitals_lab_clinical = df_cohort_vitals_lab_clinical.drop(*columns_to_drop)

# Collapse to a single partition so the export is one CSV part file with a header row.
(
    df_cohort_vitals_lab_clinical
    .repartition(1)
    .write
    .option("header", "true")
    .csv('gs://peaceful-bruin-307600/cohort_vitals_lab_clinical_v12.csv')
)

In [None]:
# Spot-check the final table by exporting the rows of a single ICU stay.
# The view is registered from df_cohort_vitals_lab_clinical under the same name
# the query reads from; the previous code referenced an undefined DataFrame
# (df_cohort_vitals_clinical) and registered a view name the query never used.
df_cohort_vitals_lab_clinical.createOrReplaceTempView('cohort_vitals_lab_clinical')

query = \
"""
select *
from cohort_vitals_lab_clinical 
where 1=1
      and icustay_id = 200033

"""    

# Run through the SparkSession created at the top of the notebook; `sqlContext`
# is never defined here and only exists when the kernel happens to inject it.
df_cohort_vitals_lab_clinical_test = spark.sql(query)
df_cohort_vitals_lab_clinical_test.repartition(1).write.option("header", "true").csv('gs://peaceful-bruin-307600/cohort_vitals_lab_clinical_test.csv')