Clinical Features Tables


In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
from google.colab import auth
auth.authenticate_user()

In [29]:
!echo “deb http://packages.cloud.google.com/apt gcsfuse-bionic main” > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  97576      0 --:--:-- --:--:-- --:--:-- 97576
OK
E: Type '“deb' is not known on line 1 in source list /etc/apt/sources.list.d/gcsfuse.list
E: The list of sources could not be read.
E: Type '“deb' is not known on line 1 in source list /etc/apt/sources.list.d/gcsfuse.list
E: The list of sources could not be read.
E: Type '“deb' is not known on line 1 in source list /etc/apt/sources.list.d/gcsfuse.list
E: The list of sources could not be read.


In [30]:
!mkdir mimic_tables
!gcsfuse --implicit-dirs mimic_test mimic_tables

mkdir: cannot create directory ‘mimic_tables’: File exists
2021/04/18 02:32:22.428541 Using mount point: /content/mimic_tables
2021/04/18 02:32:22.438473 Opening GCS connection...
2021/04/18 02:32:22.724577 Mounting file system "mimic_test"...
2021/04/18 02:32:22.725298 File system has been successfully mounted.


In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!pip install -q findspark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [1 0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Co                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Co0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Get:4 http://security.ubuntu.com/ubuntu bionic-s

In [31]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark import SparkContext

import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import *
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, when, lit, sum, to_timestamp, count, avg

spark = SparkSession.builder.getOrCreate() 

In [32]:
def load_data(gcp_path):
    sqlContext = SQLContext(sc)
    df = sqlContext.read.option("header", True).csv(gcp_path)
    return df

In [34]:
''' 
    CLINICAL FEATURE: Chronic liver disease and cirrhosis
    Presence of chronic liver disease and cirrhosis as indicated by ICD-9 571
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, CLD_CODES
    
'''

def cld_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID'])
                         , how='left')
    temp = temp.withColumn("CLD_CODES", when(col("ICD9_CODE").like("571%"), 1).otherwise(0))
    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'CLD_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('CLD_CODES'))
    temp = temp.withColumn("CLD_CODES", when(col("sum(CLD_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CLD_CODES')
    final = final.sort('SUBJECT_ID')

    return final

In [35]:
'''
    CLINICAL FEATURE: Immunocompromised
    Immunocompromised (patient has received past therapy that suppresses resistance to infection)
    as indicated by presence of any ICD-9 in V58.65, V58.0, V58.1, 042, 208.0, 202
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, IC_CODES
    
'''

def ic_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID']))
    temp = temp.withColumn('IC_CODES', when(((col("ICD9_CODE") == 'V5865')
                     | (col("ICD9_CODE") == 'V580')
                     | (col("ICD9_CODE") == '042')
                     | (col("ICD9_CODE").like("V581%"))
                     | (col("ICD9_CODE").like("2080%"))
                     | (col("ICD9_CODE").like("202%"))
                      ), 1).otherwise(0))
    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'IC_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('IC_CODES'))
    temp = temp.withColumn("IC_CODES", when(col("sum(IC_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'IC_CODES')
    final = final.sort('SUBJECT_ID')

    return final

In [36]:
'''
    CLINICAL FEATURE: Hematological malignancy
    Presence of hematologic malignancy as indicated by any ICD-9 code in 200-208
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, HM_CODES
  
'''

def hm_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID']))
    temp = temp.withColumn('HM_CODES', when(((col("ICD9_CODE").like("200%"))
                     | (col("ICD9_CODE").like("201%"))
                     | (col("ICD9_CODE").like("202%"))
                     | (col("ICD9_CODE").like("203%"))
                     | (col("ICD9_CODE").like("204%"))
                     | (col("ICD9_CODE").like("205%"))
                     | (col("ICD9_CODE").like("206%"))
                     | (col("ICD9_CODE").like("207%"))
                     | (col("ICD9_CODE").like("208%"))
                    ), 1).otherwise(0))

    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'HM_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('HM_CODES'))
    temp = temp.withColumn("HM_CODES", when(col("sum(HM_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'HM_CODES')
    final = final.sort('SUBJECT_ID')

    return final

In [37]:
'''
    CLINICAL FEATURE: Chronic heart failure
    Presence of heart failure as indicated by ICD-9 code 428
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, CHF_CODES
    
'''

def chf_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID']))
    temp = temp.withColumn('CHF_CODES', when(col("ICD9_CODE").like("428%"), 1).otherwise(0))
    
    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'CHF_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('CHF_CODES'))
    temp = temp.withColumn("CHF_CODES", when(col("sum(CHF_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHF_CODES')
    final = final.sort('SUBJECT_ID')
    
    return final

In [38]:
'''
    CLINICAL FEATURE: Chronic Organ Insufficiency
    Severe organ insufficiency (chronic liver disease, chronic heart failure,
    chronic respiratory failure, receiving chronic dialysis) as indicated by
    one of the ICD-9 codes 571, 585.6, 428.22, 428.32, 428.42, 518.83
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, COI_CODES
    
'''

def coi_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID']))
    temp = temp.withColumn('COI_CODES', when(((col("ICD9_CODE").like("571%"))
                     | (col("ICD9_CODE").like("5856%"))
                     | (col("ICD9_CODE").like("42822%"))
                     | (col("ICD9_CODE").like("42832%"))
                     | (col("ICD9_CODE").like("42842%"))
                     | (col("ICD9_CODE").like("51883%"))
                      ), 1).otherwise(0))

    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'COI_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('COI_CODES'))
    temp = temp.withColumn("COI_CODES", when(col("sum(COI_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'COI_CODES')
    final = final.sort('SUBJECT_ID')
    
    return final

In [39]:
'''
    CLINICAL FEATURE: Diabetes
    Presence of diabetes as indicated by ICD-9 code 250
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, DIAB_CODES
    
'''

def diab_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID']))
    temp = temp.withColumn('DIAB_CODES', when(col("ICD9_CODE").like("250%"), 1).otherwise(0))

    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'DIAB_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('DIAB_CODES'))
    temp = temp.withColumn("DIAB_CODES", when(col("sum(DIAB_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'DIAB_CODES')
    final = final.sort('SUBJECT_ID')

    return final

In [40]:
'''
    CLINICAL FEATURE: Metastatic carcinoma
    Metastatic carcinoma as indicated by 0.05 presence of any ICD-9 codes in 140-165, 170-175, 179-199
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, MC_CODES
    
'''

def mc_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))
    diag_icu = load_data("/content/drive/MyDrive/mimiciii/DIAGNOSES_ICD.csv")

    temp = icu_stays.join(diag_icu, (icu_stays['SUBJECT_ID'] == diag_icu['SUBJECT_ID'])
                                  & (icu_stays['HADM_ID'] == diag_icu['HADM_ID']))
    temp = temp.withColumn('MC_CODES', when(((col("ICD9_CODE").like("140%"))
                     | (col("ICD9_CODE").like("141%"))
                     | (col("ICD9_CODE").like("142%"))
                     | (col("ICD9_CODE").like("143%"))
                     | (col("ICD9_CODE").like("144%"))
                     | (col("ICD9_CODE").like("145%"))
                     | (col("ICD9_CODE").like("146%"))
                     | (col("ICD9_CODE").like("147%"))
                     | (col("ICD9_CODE").like("148%"))
                     | (col("ICD9_CODE").like("149%"))
                     | (col("ICD9_CODE").like("150%"))
                     | (col("ICD9_CODE").like("151%"))
                     | (col("ICD9_CODE").like("152%"))
                     | (col("ICD9_CODE").like("153%"))
                     | (col("ICD9_CODE").like("154%"))
                     | (col("ICD9_CODE").like("155%"))
                     | (col("ICD9_CODE").like("156%"))
                     | (col("ICD9_CODE").like("157%"))
                     | (col("ICD9_CODE").like("158%"))
                     | (col("ICD9_CODE").like("159%"))
                     | (col("ICD9_CODE").like("160%"))
                     | (col("ICD9_CODE").like("161%"))
                     | (col("ICD9_CODE").like("162%"))
                     | (col("ICD9_CODE").like("163%"))
                     | (col("ICD9_CODE").like("164%"))
                     | (col("ICD9_CODE").like("165%"))
                     | (col("ICD9_CODE").like("170%"))
                     | (col("ICD9_CODE").like("171%"))
                     | (col("ICD9_CODE").like("172%"))
                     | (col("ICD9_CODE").like("173%"))
                     | (col("ICD9_CODE").like("174%"))
                     | (col("ICD9_CODE").like("175%"))
                     | (col("ICD9_CODE").like("179%"))
                     | (col("ICD9_CODE").like("180%"))
                     | (col("ICD9_CODE").like("181%"))
                     | (col("ICD9_CODE").like("182%"))
                     | (col("ICD9_CODE").like("183%"))
                     | (col("ICD9_CODE").like("184%"))
                     | (col("ICD9_CODE").like("185%"))
                     | (col("ICD9_CODE").like("186%"))
                     | (col("ICD9_CODE").like("187%"))
                     | (col("ICD9_CODE").like("188%"))
                     | (col("ICD9_CODE").like("189%"))
                     | (col("ICD9_CODE").like("190%"))
                     | (col("ICD9_CODE").like("191%"))
                     | (col("ICD9_CODE").like("192%"))
                     | (col("ICD9_CODE").like("193%"))
                     | (col("ICD9_CODE").like("194%"))
                     | (col("ICD9_CODE").like("195%"))
                     | (col("ICD9_CODE").like("196%"))
                     | (col("ICD9_CODE").like("197%"))
                     | (col("ICD9_CODE").like("198%"))
                     | (col("ICD9_CODE").like("199%"))  
                      ), 1).otherwise(0))
    
    temp = temp.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], 'MC_CODES')
    temp = temp.groupby('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(sum('MC_CODES'))
    temp = temp.withColumn("MC_CODES", when(col("sum(MC_CODES)") > 0, 1).otherwise(0))

    final = temp.select('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'MC_CODES')
    final = final.sort('SUBJECT_ID')
    
    return final

In [41]:
'''
    CLINICAL FEATURE: Urine 6Hr
    Total urine output over the past 6 hours
    ** Not sure past 6 hrs from what event...
    So this looks at average urine output per patient per ICU Admission
    -if there is no data on urine output for a patient,then the average urine output was used instead (163.6 mL)
    
    OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, UR_AMT
    
    BIGQUERY SQL: built based on
        https://github.com/MIT-LCP/mimic-code/blob/master/concepts/fluid_balance/urine_output.sql
'''

def ur_feature():
    output_events = load_data("/content/drive/MyDrive/mimiciii/OUTPUTEVENTS.csv")
    output_events = output_events.withColumn("SUBJECT_ID", output_events["SUBJECT_ID"].cast(IntegerType()))
    output_events = output_events.withColumn("VALUE", output_events["VALUE"].cast(IntegerType()))

    ur_chart_events = output_events.filter((col("ITEMID") == 40055)
                                         | (col("ITEMID") == 433175)
                                         | (col("ITEMID") == 40069)
                                         | (col("ITEMID") == 40715)
                                         | (col("ITEMID") == 40473)
                                         | (col("ITEMID") == 40085)
                                         | (col("ITEMID") == 40057)
                                         | (col("ITEMID") == 40056)
                                         | (col("ITEMID") == 40405)
                                         | (col("ITEMID") == 40428)
                                         | (col("ITEMID") == 40086)
                                         | (col("ITEMID") == 40096)
                                         | (col("ITEMID") == 40651)
                                         | (col("ITEMID") == 226559)
                                         | (col("ITEMID") == 226560)
                                         | (col("ITEMID") == 226561)
                                         | (col("ITEMID") == 226584)
                                         | (col("ITEMID") == 226563)
                                         | (col("ITEMID") == 226564)
                                         | (col("ITEMID") == 226565)
                                         | (col("ITEMID") == 226567)
                                         | (col("ITEMID") == 226557)
                                         | (col("ITEMID") == 226558)
    #                                      | (col("ITEMID") == 227488)
                                         | (col("ITEMID") == 227489)
                                        )

    ur_chart_events = ur_chart_events.groupBy('SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID').agg(avg("VALUE"))
    ur_chart_events = ur_chart_events.withColumnRenamed('avg(VALUE)', 'UR_AMT')
    ur_chart_events = ur_chart_events.filter(col("ICUSTAY_ID").isNotNull())
    ur_chart_events = ur_chart_events.orderBy(col('SUBJECT_ID').asc())

    avg_ur_amt = ur_chart_events.agg(avg("UR_AMT"))
    avg_ur_amt = list(avg_ur_amt.select("avg(UR_AMT)").toPandas()["avg(UR_AMT)"])[0]

    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))

    final = icu_stays.join(ur_chart_events, (ur_chart_events['SUBJECT_ID'] == icu_stays['SUBJECT_ID'])
                                          & (ur_chart_events['ICUSTAY_ID'] == icu_stays['ICUSTAY_ID'])
                                          & (ur_chart_events['HADM_ID'] == icu_stays['HADM_ID'])
                                            , how="left")
    final = final.sort(icu_stays['SUBJECT_ID'])
    final = final.withColumn('UR_AMT', when(col('UR_AMT').isNull(), avg_ur_amt).otherwise(col('UR_AMT')))

    final = final.select(icu_stays['SUBJECT_ID']
                     , icu_stays['ICUSTAY_ID']
                     , icu_stays['HADM_ID']
                     , col("UR_AMT")
                     )

    return final

In [69]:
'''
  CLINICAL FEATURE:  Presence of at least two of the SIRS criteria at the current time

  OUTPUT: SUBJECT_ID, HADM_ID, ICUSTAY_ID, UR_AMT
    
  BIGQUERY SQL: built based on
    https://github.com/MIT-LCP/mimic-code/blob/master/concepts/severityscores/sirs.sql

'''
def sirs_feature():
    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays.registerTempTable('icu_stays')
    vitals_first_day = load_data("/content/mimic_tables/vitals_first_day.csv")
    vitals_first_day.registerTempTable("vitals_first_day")
    labs_first_day = load_data("/content/mimic_tables/labs_first_day.csv")
    labs_first_day.registerTempTable("labs_first_day")
    blood_gas_first_day_arterial = load_data("/content/mimic_tables/blood_gas_first_day_arterial.csv")
    blood_gas_first_day_arterial.registerTempTable("blood_gas_first_day_arterial")
    query = \
    '''
        with bg as
        (
          -- join blood gas to ventilation durations to determine if patient was vent
          select bg.icustay_id
          , min(pco2) as paco2_min
          from blood_gas_first_day_arterial bg
          where specimen_pred = 'ART'
          group by bg.icustay_id
        )
        -- Aggregate the components for the score
        , scorecomp as
        (
        select ie.icustay_id
          , v.tempc_min
          , v.tempc_max
          , v.heartrate_max
          , v.resprate_max
          , bg.paco2_min
          , l.wbc_min
          , l.wbc_max
          , l.bands_max
        FROM icu_stays ie
        left join bg
        on ie.icustay_id = bg.icustay_id
        left join vitals_first_day v
          on ie.icustay_id = v.icustay_id
        left join labs_first_day l
          on ie.icustay_id = l.icustay_id
        )
        , scorecalc as
        (
          -- Calculate the final score
          -- note that if the underlying data is missing, the component is null
          -- eventually these are treated as 0 (normal), but knowing when data is missing is useful for debugging
          select icustay_id

          , case
              when tempc_min < 36.0 then 1
              when tempc_max > 38.0 then 1
              when tempc_min is null then null
              else 0
            end as temp_score


          , case
              when heartrate_max > 90.0  then 1
              when heartrate_max is null then null
              else 0
            end as heartrate_score

          , case
              when resprate_max > 20.0  then 1
              when paco2_min < 32.0  then 1
              when coalesce(resprate_max, paco2_min) is null then null
              else 0
            end as resp_score

          , case
              when wbc_min <  4.0  then 1
              when wbc_max > 12.0  then 1
              when bands_max > 10 then 1-- > 10% immature neurophils (band forms)
              when coalesce(wbc_min, bands_max) is null then null
              else 0
            end as wbc_score

          from scorecomp
        )
        select
          ie.subject_id, ie.hadm_id, ie.icustay_id
          -- Combine all the scores to get SOFA
          -- Impute 0 if the score is missing
          , coalesce(temp_score,0)
          + coalesce(heartrate_score,0)
          + coalesce(resp_score,0)
          + coalesce(wbc_score,0)
            as sirs
          , temp_score, heartrate_score, resp_score, wbc_score
        FROM icu_stays ie
        left join scorecalc s
          on ie.icustay_id = s.icustay_id
        order by ie.icustay_id;
    '''

    sirs = spark.sql(query)
    sirs = sirs.withColumn("subject_id", sirs["subject_id"].cast(IntegerType()))
    sirs = sirs.withColumn("SIRS", when(col('sirs') >= 2, 1).otherwise(0)).select("subject_id", "hadm_id", "icustay_id", "SIRS")
    sirs = sirs.sort('subject_id')

    return sirs


In [123]:
'''
    CLINICAL FEATURE:  Cardiac Surgery Patient
    OUTPUT: SUBJECT_ID, HADM_ID, CSURG

'''
def cardiac_surg():

    services = load_data("/content/drive/MyDrive/mimiciii/SERVICES.csv")
    services = services.withColumn("SUBJECT_ID", services["SUBJECT_ID"].cast(IntegerType()))

    csurg = services.withColumn("CSURG", when(col("CURR_SERVICE") == "CSURG", 1).otherwise(0))
    csurg = csurg.groupby("SUBJECT_ID", "HADM_ID").agg(sum("CSURG")).withColumnRenamed("sum(CSURG)", "CSURG")
    csurg = csurg.withColumn("CSURG", when(col("CSURG") >= 1, 1).otherwise(0))

    icu_stays = load_data("/content/drive/MyDrive/mimiciii/ICUSTAYS.csv")
    icu_stays = icu_stays.withColumn("SUBJECT_ID", icu_stays["SUBJECT_ID"].cast(IntegerType()))

    cardiac_surg = icu_stays.join(csurg, ((icu_stays['SUBJECT_ID'] == csurg['SUBJECT_ID']) & (icu_stays['HADM_ID'] == csurg['HADM_ID'])), "left")
    cardiac_surg = cardiac_surg.withColumn("CSURG", when(col("CSURG").isNull(), 0).otherwise(col("CSURG")))
    cardiac_surg = cardiac_surg.select(icu_stays['SUBJECT_ID'], icu_stays['HADM_ID'], icu_stays['ICUSTAY_ID'], "CSURG")
    cardiac_surg = cardiac_surg.sort("SUBJECT_ID")

    return cardiac_surg


In [124]:
cld = cld_feature()
ic = ic_feature()
hm = hm_feature()
chf = chf_feature()
coi = coi_feature()
diab = diab_feature()
mc = mc_feature()
ur = ur_feature()
sirs = sirs_feature()
csurg = cardiac_surg()

In [125]:
clinical_features = cld.join(ic, (cld['SUBJECT_ID'] == ic['SUBJECT_ID'])
                               & (cld['HADM_ID'] == ic['HADM_ID'])
                               & (cld['ICUSTAY_ID'] == ic['ICUSTAY_ID']))
clinical_features = clinical_features.join(hm, (cld['SUBJECT_ID'] == hm['SUBJECT_ID'])
                                             & (cld['HADM_ID'] == hm['HADM_ID'])
                                             & (cld['ICUSTAY_ID'] == hm['ICUSTAY_ID']))
clinical_features = clinical_features.join(chf, (cld['SUBJECT_ID'] == chf['SUBJECT_ID'])
                                              & (cld['HADM_ID'] == chf['HADM_ID'])
                                              & (cld['ICUSTAY_ID'] == chf['ICUSTAY_ID']))
clinical_features = clinical_features.join(coi, (cld['SUBJECT_ID'] == coi['SUBJECT_ID'])
                                              & (cld['HADM_ID'] == coi['HADM_ID'])
                                              & (cld['ICUSTAY_ID'] == coi['ICUSTAY_ID']))
clinical_features = clinical_features.join(diab, (cld['SUBJECT_ID'] == diab['SUBJECT_ID'])
                                               & (cld['HADM_ID'] == diab['HADM_ID'])
                                               & (cld['ICUSTAY_ID'] == diab['ICUSTAY_ID']))
clinical_features = clinical_features.join(mc, (cld['SUBJECT_ID'] == mc['SUBJECT_ID'])
                                             & (cld['HADM_ID'] == mc['HADM_ID'])
                                             & (cld['ICUSTAY_ID'] == mc['ICUSTAY_ID']))
clinical_features = clinical_features.join(ur, (cld['SUBJECT_ID'] == ur['SUBJECT_ID'])
                                             & (cld['HADM_ID'] == ur['HADM_ID'])
                                             & (cld['ICUSTAY_ID'] == ur['ICUSTAY_ID']))
clinical_features = clinical_features.join(sirs, (cld['SUBJECT_ID'] == sirs['subject_id'])
                                             & (cld['HADM_ID'] == sirs['hadm_id'])
                                             & (cld['ICUSTAY_ID'] == sirs['icustay_id']))
clinical_features = clinical_features.join(csurg, (cld['SUBJECT_ID'] == csurg['SUBJECT_ID'])
                                             & (cld['HADM_ID'] == csurg['HADM_ID'])
                                             & (cld['ICUSTAY_ID'] == csurg['ICUSTAY_ID']))
clinical_features = clinical_features.select(cld['SUBJECT_ID']
                                            , cld['HADM_ID']
                                            , cld['ICUSTAY_ID']
                                            , 'CLD_CODES'
                                            , 'HM_CODES'
                                            , 'CHF_CODES'
                                            , 'COI_CODES'
                                            , 'DIAB_CODES'
                                            , 'MC_CODES'
                                            , 'UR_AMT'
                                            , 'SIRS'
                                            , 'CSURG')
clinical_features = clinical_features.sort('SUBJECT_ID')
clinical_features.show()

+----------+-------+----------+---------+--------+---------+---------+----------+--------+------------------+----+-----+
|SUBJECT_ID|HADM_ID|ICUSTAY_ID|CLD_CODES|HM_CODES|CHF_CODES|COI_CODES|DIAB_CODES|MC_CODES|            UR_AMT|SIRS|CSURG|
+----------+-------+----------+---------+--------+---------+---------+----------+--------+------------------+----+-----+
|         2| 163353|    243653|        0|       0|        0|        0|         0|       0|163.60768834335641|   1|    0|
|         3| 145834|    211552|        0|       0|        1|        0|         0|       0| 142.9433962264151|   1|    0|
|         4| 185777|    294638|        1|       0|        0|        1|         0|       0| 508.3333333333333|   1|    0|
|         5| 178980|    214757|        0|       0|        0|        0|         0|       0|163.60768834335641|   0|    0|
|         6| 107064|    228232|        0|       0|        0|        0|         0|       0|  80.6896551724138|   0|    0|
|         7| 118037|    236754| 

In [127]:
# clinical_features.toPandas().to_csv('/content/drive/MyDrive/CSE6250_Est/Model/clinical_features.csv')