<a href="https://colab.research.google.com/github/rmadushani/Three_vasopressor_problem/blob/main/eICU/codes/eICU_Three_Vasopressors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###
# @author: R. W. M. A. Madushani
# Created on Dec 20, 2019
###

In [2]:
# Imports for accessing eICU data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# Import libraries
import os
import pandas as pd
import numpy as np

In [3]:
auth.authenticate_user()

In [4]:
project_id = 'sccm-datathon'
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

In [5]:
# Read data from BigQuery into pandas dataframes.
def run_query(query):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

# Extract and clean demographic data

In [6]:
# Extract relevant patient data
patient_dat_query = \
"""
SELECT uniquepid, patientHealthSystemStayID, patientUnitStayID, uniquepid, gender, age, ethnicity, unitType, hospitalDischargeLocation, hospitalDischargeStatus
FROM `physionet-data.eicu_crd.patient`
"""

patient_dat = run_query(patient_dat_query)

## We consider only 'MICU', and 'Med-Surg ICU'
patient_dat_new = patient_dat.loc[(patient_dat.unitType=='MICU') | (patient_dat.unitType=='Med-Surg ICU'),: ].copy()

## Convert age>89 strings to numeric to extract patients with age>=18
patient_dat_new['age_int'] = patient_dat_new['age'].apply(lambda x: 90 if x=='> 89' else (-999 if x=='' else int(x)))
## Filter patients with age >= 18
patient_dat_new = patient_dat_new.loc[patient_dat_new.age_int >= 18, :]

## Remove records with missing hospital discharge status i.e., mortality
patient_dat_new = patient_dat_new.loc[patient_dat_new.hospitalDischargeStatus!='', :]
patient_dat_new.head()

Unnamed: 0,uniquepid,patientHealthSystemStayID,patientUnitStayID,uniquepid_1,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,age_int
0,011-43764,844848,1137569,011-43764,Female,75,Other/Unknown,MICU,Death,Expired,75
1,030-57208,2462651,3036927,030-57208,Male,51,Caucasian,Med-Surg ICU,Home,Alive,51
2,030-13634,2481889,3058863,030-13634,Female,31,Caucasian,Med-Surg ICU,Home,Alive,31
3,030-53536,2494105,3072720,030-53536,Male,58,Caucasian,Med-Surg ICU,Home,Alive,58
4,030-59276,2496542,3075429,030-59276,Female,68,Caucasian,Med-Surg ICU,Death,Expired,68


In [7]:
## Ethnicity types
patient_dat_new.ethnicity.unique()

array(['Other/Unknown', 'Caucasian', 'Hispanic', 'African American',
       'Asian', '', 'Native American'], dtype=object)

In [8]:
## Convert missing ethnicity string '' to none
patient_dat_new.loc[patient_dat_new.ethnicity=='', 'ethnicity'] = None

In [9]:
(patient_dat_new.groupby('uniquepid')['ethnicity'].nunique()>1).sum() ## There are 427 patients with multiple ethnicies assigned

359

In [10]:
## Find patients with no ethnicity information available
ethnicity_missing = patient_dat_new.groupby('uniquepid')['ethnicity'].apply(lambda x: x.isnull().values.all())
ethnicity_missing.head()

uniquepid
002-10009    False
002-10018    False
002-10034    False
002-10050    False
002-10052    False
Name: ethnicity, dtype: bool

In [11]:
## Remove patients who don't have any ethnicity information available for any of their ICU stays (However, there were unique 1348 such patients)
## We clean ethnicity infomation by assigning a unique ethnicity for each patient after we extract our cohort of third vasopressor patients
patient_dat_new = patient_dat_new.loc[patient_dat_new.uniquepid.isin(ethnicity_missing [~ethnicity_missing].index),:]

In [12]:
## List of ICU stays ids to extract other variables
patientUnitStayIDs = list(patient_dat_new.patientUnitStayID)

# Extract and clean vasopressor medication data

In [13]:
# Extract vasopressor related medication data
vasp_med_query = \
"""
SELECT medicationid, patientunitstayid, drugstartoffset, drugname, drughiclseqno, routeadmin, frequency, drugstopoffset, gtc
FROM `physionet-data.eicu_crd.medication`
WHERE (drugname is null OR (UPPER(drugname) like '%EPINEPHRIN%') OR (UPPER(drugname) LIKE '%ADRENALIN%') OR (UPPER(drugname) LIKE '%PHENYLEPHRIN%') OR (UPPER(drugname) LIKE '%VASOPRESSIN%') OR (UPPER(drugname) like '%DOPAMINE%') OR (UPPER(drugname) like '%ANGIOTENSIN%')) 
"""

med_vasp_dat = run_query(vasp_med_query)

In [14]:
## Extract medication data correponding to filtered patient data
med_vasp_dat = med_vasp_dat.loc[med_vasp_dat.patientunitstayid.isin(patientUnitStayIDs),:]

## Vasopressors considered in the analysisare: 'dopamine', 'vasopressin', 'norepinephrine', 'phenylephrine', 'epinephrine', 'angiotensin'
## HICL codes for vasopressors (was taken from: https://github.com/MIT-LCP/eicu-code/blob/master/concepts/pivoted/pivoted-med.sql)
## Note: 'angiotensin' does not appear in eicu medication data and we were unable to find the corrosponding HICL code for 'angiotensin'
hicl_vasp_dict = {'norepinephrine':[37410, 36346, 2051], 'epinephrine':[37407, 39089, 36437, 34361, 2050], 'dopamine': [2060, 2059], 'phenylephrine': [37028, 35517, 35587, 2087], 'vasopressin':[38884, 38883, 2839]}


## User defined function to impute the vasopressor type (drugname column) using HICL codes
## 'return_key' function maps HICL codes to corresponding vasopressor based on the dictionary "hicl_vasp_dict"
def return_key(val):
  for key, value in hicl_vasp_dict.items():
    if val in value:
      return key 

## User defined function to cleam and impute the vasopressor type (drugname column) using HICL codes
## 'clean_drugname' homogenize the vasopressor names in the drugname column
def clean_drugname(name):
  if name is not None:
    if 'norepinephrine' in name.lower():
      return 'norepinephrine'
    elif 'phenylephrine' in name.lower():
      return 'phenylephrine'
    elif 'epinephrine' in name.lower():
      return 'epinephrine'
    elif 'vasopressin' in name.lower():
      return 'vasopressin'
    elif 'dopamine' in name.lower():
      return 'dopamine'
    elif 'angiotensin' in name.lower():
      return 'angiotensin'


## Homogenize the vasopressor names in the drugname column
med_vasp_dat['drugname_cleaned'] = med_vasp_dat['drugname'].apply(clean_drugname)

## impute the vasopressor type (drugname column) using HICL codes
med_vasp_dat.loc[med_vasp_dat.loc[:, 'drugname'].isnull(), 'drugname_cleaned'] = med_vasp_dat.loc[med_vasp_dat.loc[:, 'drugname'].isnull(), 'drughiclseqno'].apply(return_key)

## impute the vasopressor type (drugname column) using HICL codes
med_vasp_dat.loc[med_vasp_dat.loc[:, 'drugname'].isnull(), 'drugname_cleaned'] = med_vasp_dat.loc[med_vasp_dat.loc[:, 'drugname'].isnull(), 'drughiclseqno'].apply(return_key)

## Drop records with missing vasopressor type
med_vasp_dat_new = med_vasp_dat.loc[~med_vasp_dat.drugname_cleaned.isnull(),:].copy()

## Zero drug start/stop times may not be correct as they may be representing nulls 
## Thereore, we remove those
med_vasp_dat_new = med_vasp_dat_new.loc[~((med_vasp_dat_new.drugstartoffset==0) | (med_vasp_dat_new.drugstopoffset==0)),:]

## Drop records with imposible start-stop times
med_vasp_dat_new = med_vasp_dat_new.loc[med_vasp_dat_new.drugstopoffset > med_vasp_dat_new.drugstartoffset, :]

## Drop records of vasopressor administration before ICU
## Note that if drugstartoffset is negative when drugstopoffset is positive, we do not drop those
## We consider the vasopressor administration time from ICU admission to drug stop time to determine 2hour time crieteria for such records
med_vasp_dat_new = med_vasp_dat_new.loc[med_vasp_dat_new.drugstopoffset	> 0,:]

In [15]:
med_vasp_dat_new.head()

Unnamed: 0,medicationid,patientunitstayid,drugstartoffset,drugname,drughiclseqno,routeadmin,frequency,drugstopoffset,gtc,drugname_cleaned
77847,60621647,1695384,351,NOREPINEPHRINE,,IV CONT,INFUSE,1011,0,norepinephrine
78198,59562537,1607519,5335,NOREPINEPHRINE,,IV CONT,INFUSE,6827,0,norepinephrine
78465,57605875,1611927,-2,NOREPINEPHRINE,,IV CONT,X1 M959,145,0,norepinephrine
78537,61089483,1681549,1259,NOREPINEPHRINE,,IV CONT,CoNtInUos,1540,0,norepinephrine
83733,59354402,1643277,9151,NOREPINEPHRINE,,IV CONT,INFUSE,9560,0,norepinephrine


In [16]:
med_vasp_dat_new.routeadmin.unique() ## Note that there are data of which vasopressor routes of administration are not relevant to the study as we consider infusion drugs

array(['IV CONT', 'IR', 'IVPB', 'IV', 'SUBQ', 'IM', '.ROUTE', 'X',
       'CENTRAL IV', 'See Instruct', 'Intravenous', 'Miscellaneou',
       'Each Nostril', 'Intra-articular', 'PERIPH IV', 'Endotracheal',
       'ENDOSCOPY TU', 'INH', 'INHAL', 'NEBULIZER', 'IntraVENOUS', 'SubQ',
       'INTRAVEN', 'IV PUSH', 'IVP', 'Intramuscular', 'SUBCUT',
       'INTRAVEN.',
       'IV (intravenous)                                                                                    ',
       'IVDRIP', 'SQ', 'LVF', 'INTRAVENOUS CONTINUOUS', 'Central Line',
       'IV - continuous infusion (intravenous)                                                              ',
       'INJ', 'SC', 'IV Push', 'SUBCUTAN', 'Subcutaneous', 'Inj',
       'INTRAVENOUS', 'INTRAMUSCULAR', 'IM/SQ', 'SUB-Q', 'I-dermal',
       'INF', 'INTRALESION', 'INTRAVENOU', 'Epid', 'subCUT',
       'IV (injection)                                                                                      ',
       'IJ', 'IVADD', 'SUBCUTANEOU

In [17]:
## Filter data by route of administration
## Note that we selected these routes according to expert opinion after a thorough investigation of data
roa_list = ['INTRAVEN.', 'INF', 'IV', 'IV CONT', 'Intravenous', 'IntraVENOUS', 'INTRAVENOU', 'IVDRIP',  'INTRAVEN',  'INTRAVENOUS CONTINUOUS', 'IV (intravenous)                                                                                    ', 'IV - continuous infusion (intravenous)                                                              ', 'INTRAVENOUS',  'IVPB',  'CENTRAL IV', 'PERIPH IV', 'Central Line',  'IV - continuous infusion (injection)                                                                ', 'INJ', 'Inj']

med_vasp_dat_new = med_vasp_dat_new.loc[med_vasp_dat_new.routeadmin.isin(roa_list), ['patientunitstayid', 'drugstartoffset', 'drugstopoffset', 'drugname_cleaned']]
med_vasp_dat_new.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned
77847,1695384,351,1011,norepinephrine
78198,1607519,5335,6827,norepinephrine
78465,1611927,-2,145,norepinephrine
78537,1681549,1259,1540,norepinephrine
83733,1643277,9151,9560,norepinephrine


In what follows we extract start and stop times of third vasopressor administration. For that we:

(1) Merge overlapping times of same vasopressor administration per ICU stay

(2) Extract the start and stop times for third vasopressor administered continuously at least for 2 hours

In [18]:
## Define a function to merge overlapping/continuous vasopressor records
## Input: "dat_by_icu_stays" is data from "med_vasp_dat_new" grouped by 'patientunitstayid'
## Output: "df" is a dataframe with merged overlapping times per ICU stay
def merge_vasopressors(dat_by_icu_stays):
  df = pd.DataFrame(columns=['patientunitstayid', 'drugstartoffset', 'drugstopoffset', 'drugname_cleaned'])
  vasp_types = list(dat_by_icu_stays.drugname_cleaned.unique())
  for vasp in vasp_types:
    vasp_dat_temp = dat_by_icu_stays.loc[dat_by_icu_stays.drugname_cleaned == vasp, :].copy()
    vasp_dat_temp.sort_values(by=['drugstartoffset', 'drugstopoffset'], ascending=True, inplace=True)
    vasp_dat_temp.reset_index(drop=True, inplace=True)
    df = df.append(vasp_dat_temp.loc[0,:], ignore_index=True)
    for ind in range(1, (vasp_dat_temp.shape[0])):
      start_time_first = df.loc[df.index[-1], 'drugstartoffset']
      stop_time_first = df.loc[df.index[-1], 'drugstopoffset']
      start_time_second = vasp_dat_temp.loc[ind, 'drugstartoffset']
      stop_time_second = vasp_dat_temp.loc[ind, 'drugstopoffset']
      if (start_time_second <= stop_time_first):
        df.loc[df.index[-1], 'drugstopoffset'] = max(stop_time_first, stop_time_second)    
      else:
        df = df.append(vasp_dat_temp.loc[ind,:], ignore_index=True)
  return df

In [19]:
## Merge overlapping vasopressor administrations
merged_dat = med_vasp_dat_new.groupby(by='patientunitstayid').apply(merge_vasopressors)

merged_dat.reset_index(drop=True, inplace=True)
merged_dat = merged_dat.astype({'patientunitstayid': 'int64', 'drugstartoffset': 'int64', 'drugstopoffset': 'int64'})
merged_dat.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned
0,141168,2046,2390,norepinephrine
1,141168,2121,2390,dopamine
2,141288,-8,3100,norepinephrine
3,141432,5,1210,norepinephrine
4,141585,7406,17745,norepinephrine


In [20]:
## Next we use "merged_dat" to extract third vasopressor times
## Note that we would like to extract third vasopressor times of which 
## vasopressors were administered continuously for at least two hours durong the
## ICU stay. Two hour time period starts from the ICU admission.
## Therefore, we make start time of all vasopressors administration began prior to ICU admission zero.
## However, we still want to keep the original start times for later use.
merged_dat.loc[:, 'drugstartoffset_original'] = merged_dat.loc[:, 'drugstartoffset']
## Change drug start time to ICU admission time when the drug start time is prior to ICU admission
merged_dat.loc[merged_dat.drugstartoffset < 0, 'drugstartoffset'] = 0

## Remove data of which vasopressor administration was less than two hours 
merged_dat = merged_dat.loc[(merged_dat.drugstopoffset-merged_dat.drugstartoffset) >= (2*60), :].copy()
merged_dat.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned,drugstartoffset_original
0,141168,2046,2390,norepinephrine,2046
1,141168,2121,2390,dopamine,2121
2,141288,0,3100,norepinephrine,-8
3,141432,5,1210,norepinephrine,5
4,141585,7406,17745,norepinephrine,7406


In [21]:
## Define a function to extract third vasopressor time
## Input: "dat" is data from "merged_dat" grouped by 'patientunitstayid'
## Output: "third_vasp_start_time_dat" is a dataframe with third vasopressor times per ICU stay
def third_vasp(dat):
  third_vasp_start_time_dat = pd.DataFrame(columns=['patientunitstayid', 'drugstartoffset', 'drugstopoffset', 'drugname_cleaned', 'drugstartoffset_original', 'first_vasp_drugstartoffset', 'second_vasp_drugstartoffset'])
  for row_id in range(dat.shape[0]-2):
    dat_temp = dat.iloc[row_id:(row_id+3)].copy()
    if ((dat_temp.drugname_cleaned.nunique() == 3) and ((min(dat_temp.drugstopoffset)-max(dat_temp.drugstartoffset)) >= 120)):
      dat_temp[['first_vasp_drugstartoffset']] = dat_temp.iloc[0, 4]
      dat_temp[['second_vasp_drugstartoffset']] = dat_temp.iloc[1, 4]
      third_vasp_start_time_dat = third_vasp_start_time_dat.append(dat_temp.iloc[2, :], ignore_index=True)
      break;
  return third_vasp_start_time_dat

In [22]:
## Extract third vasopressor times
merged_dat.sort_values(by=['patientunitstayid', 'drugstartoffset', 'drugstopoffset'], ascending=True, inplace=True)
third_vasp_time_dat = merged_dat.groupby(by='patientunitstayid').apply(third_vasp)
third_vasp_time_dat = third_vasp_time_dat.astype({'patientunitstayid': 'int64', 'drugstartoffset': 'int64', 'drugstopoffset': 'int64', 'drugstartoffset_original':  'int64', 'first_vasp_drugstartoffset': 'int64', 'second_vasp_drugstartoffset': 'int64'})
third_vasp_time_dat.reset_index(drop=True, inplace=True)
third_vasp_time_dat.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned,drugstartoffset_original,first_vasp_drugstartoffset,second_vasp_drugstartoffset
0,144173,107,390,vasopressin,107,47,92
1,145394,2982,10012,phenylephrine,2982,222,2982
2,146133,1663,2039,dopamine,1663,43,268
3,146349,677,3286,phenylephrine,677,497,497
4,147985,369,3009,norepinephrine,369,174,279


In [23]:
## Check for any third vasopressor starting time prior to ICU admission
print(sum(third_vasp_time_dat.drugstartoffset_original<0))

118


In [24]:
## Remove third vasopressor start times prior to ICU admission
third_vasp_time_dat = third_vasp_time_dat.loc[third_vasp_time_dat.drugstartoffset_original>=0,:]

In [25]:
## Rename columns of third vasopressor times
third_vasp_time_dat.rename(columns={'drugstartoffset': 'third_vasp_drugstartoffset', 'drugstopoffset': 'third_vasp_drugstopoffset', 'drugstartoffset_original': 'third_vasp_drugstartoffset_original', 'drugname_cleaned': 'drugname'}, inplace=True)
third_vasp_time_dat.head()

Unnamed: 0,patientunitstayid,third_vasp_drugstartoffset,third_vasp_drugstopoffset,drugname,third_vasp_drugstartoffset_original,first_vasp_drugstartoffset,second_vasp_drugstartoffset
0,144173,107,390,vasopressin,107,47,92
1,145394,2982,10012,phenylephrine,2982,222,2982
2,146133,1663,2039,dopamine,1663,43,268
3,146349,677,3286,phenylephrine,677,497,497
4,147985,369,3009,norepinephrine,369,174,279


# Clean Ethnicity Information

In [28]:
## Demographic info for third vasopressor patients
patient_dat_new = patient_dat_new.loc[patient_dat_new.patientUnitStayID.isin(third_vasp_time_dat.patientunitstayid), :]
patient_dat_new.head()

Unnamed: 0,uniquepid,patientHealthSystemStayID,patientUnitStayID,uniquepid_1,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,age_int
1468,006-24923,645462,858255,006-24923,Male,76,Caucasian,Med-Surg ICU,Skilled Nursing Facility,Alive,76
2296,006-249436,663463,888281,006-249436,Male,78,Caucasian,Med-Surg ICU,Other External,Alive,78
2304,006-6800,676864,910583,006-6800,Female,69,Caucasian,Med-Surg ICU,Death,Expired,69
2491,006-202090,591475,768433,006-202090,Male,58,Asian,Med-Surg ICU,Home,Alive,58
3712,006-86051,471667,568682,006-86051,Female,78,Caucasian,Med-Surg ICU,Death,Expired,78


In [31]:
(patient_dat_new.groupby('uniquepid')['ethnicity'].nunique()>1).sum() ## There is no patient with multiple ethnicies assigned

0

In [33]:
## Number of patients with missing ethinicity information for the ICU stay that was selected under third vasopressor crieteria
## Note that the ethnicity info of these ICU stays can be found from these patient's other available ICU stays in "patient_dat"
patient_dat_new.ethnicity.isnull().sum() 

1

In [63]:
## 'patientUnitStayID' of missing ethnicity info
pid_missing_ethnicity = int(patient_dat_new.patientUnitStayID[patient_dat_new.ethnicity.isnull()])
pid_missing_ethnicity

3242563

In [70]:
## Unique number of ethnicity types assigned for the patient with missing ethnicity  in his/her other ICU stays 
patient_dat.loc[(patient_dat.uniquepid.isin(patient_dat_new.loc[patient_dat_new.ethnicity.isnull(),'uniquepid']) & (patient_dat.patientUnitStayID != pid_missing_ethnicity)),'ethnicity'].nunique()

1

In [82]:
## Impute missing ethnicity extracted from other ICU stays
patient_dat_new.loc[patient_dat_new.ethnicity.isnull(),'ethnicity'] = patient_dat.loc[(patient_dat.uniquepid.isin(patient_dat_new.loc[patient_dat_new.ethnicity.isnull(),'uniquepid']) & (patient_dat.patientUnitStayID != pid_missing_ethnicity)),'ethnicity'].values

In [88]:
## Create a new column with dummy coding ethnicity as Caucasian = 1, and 0 otherwise
patient_dat_new['ethnicity_cat'] = patient_dat_new['ethnicity'].apply(lambda x: 1 if x=='Caucasian' else 0)

# Extract and clean labs data
### We consider two labs in our analysis: Creatinine (blood test) and Lactate

In [None]:
# Extract relevant lab data
labs_dat_query = \
"""
SELECT *
FROM `physionet-data.eicu_crd.lab` io
WHERE (io.labname is null OR io.labname LIKE '%lactate%' OR io.labname LIKE '%creatinine%')
"""

labs_dat = run_query(labs_dat_query)

In [None]:
labs_dat.head()

Unnamed: 0,labid,patientunitstayid,labresultoffset,labtypeid,labname,labresult,labresulttext,labmeasurenamesystem,labmeasurenameinterface,labresultrevisedoffset
0,95864735,353695,6687,4,urinary creatinine,20.4,20.4,mg/dL,mg/dL,7146
1,58333223,200922,26235,4,urinary creatinine,36.01,36.01,mg/dL,mg/dL,26317
2,59220535,227756,494,4,urinary creatinine,70.36,70.36,mg/dL,mg/dL,2716
3,78311710,248586,1385,4,urinary creatinine,105.1,105.1,mg/dL,mg/dL,2222
4,97584419,317857,787,4,urinary creatinine,247.2,247.2,mg/dL,mg/dL,1201


In [None]:
labs_dat.labname.unique() ## Note that we have extracted both blood and urine creatinine

array(['urinary creatinine', 'lactate', 'creatinine'], dtype=object)

In [None]:
## Note that there are more additional lab values present in 'labresulttext' column.
## Therefore we consider 'labresulttext' column to extract values.
print("missings in text lab values:" + str(labs_dat.labresulttext.isnull().sum()))
print("missings in numeric lab values:" + str(labs_dat.labresult.isnull().sum()))
print("Difference in lab values (text vs numeric):" + str(sum(labs_dat.loc[~labs_dat.labresult.isnull(), "labresult"] -labs_dat.loc[~labs_dat.labresult.isnull(), "labresulttext"].astype('float'))))

labs_dat.drop(['labresult', 'labmeasurenameinterface', 'labid', 'labtypeid'], axis = 1, inplace=True)
labs_dat.head()

missings in text lab values:0
missings in numeric lab values:3008
Difference in lab values (text vs numeric):0.0


Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,labresultrevisedoffset
0,353695,6687,urinary creatinine,20.4,mg/dL,7146
1,200922,26235,urinary creatinine,36.01,mg/dL,26317
2,227756,494,urinary creatinine,70.36,mg/dL,2716
3,248586,1385,urinary creatinine,105.1,mg/dL,2222
4,317857,787,urinary creatinine,247.2,mg/dL,1201


##### In what follows we extract nearest lab value drawn within +/- 12 hours from the third vasopressor starting time. In order to do that we:

  ##### (1) Filter out all the lab values that were not drawn within +/- 12 hours from the third vasopressor starting time.

  ##### (2) We extract the lab value drawn at a closest time to third vasopressor starting time, giving the priority for values drawn prior to third vasopressor starting time. In otherwords, we obtain lab values drawn after third vasopressor starting time only for the cases where there is no available lab value drawn prior to third vasopressor starting time.

In [None]:
## Funcion "grouped_dat" extract closest lab values drawn to the third vasopressor starting time
## Input: grouped lab data by 'patientunitstayid'
##        Input data should have labs values only within +/- 12 hours from the third vasopressor starting time.
##        Input data should also have a column called 'time_diff' which contains the time offset of the third vasopressor start time from the time the lab value was drawn 
## Output: Returns the lab value drawn closest to the third vasopressor starting time. Note that we give priority for lab values drawn prior to the third vasopressor starting time
def filter_labs(grouped_dat):
  if (any(grouped_dat.time_diff < 0)):
    return(grouped_dat.loc[grouped_dat.loc[grouped_dat.time_diff < 0, 'time_diff'].idxmax(axis="columns"), 'labresulttext'])
  else:
    return(grouped_dat.loc[grouped_dat.loc[grouped_dat.time_diff >= 0, 'time_diff'].idxmin(axis="columns"), 'labresulttext'])

### Clean creatinine lab data

In [None]:
## We only consider blood creatinine in our analysis
labs_dat_creatinine = labs_dat.loc[labs_dat.labname == 'creatinine',:].copy() 

## Check if all values are meassured with same units
print("units of blood creatinine:"+ str(labs_dat_creatinine.labmeasurenamesystem.unique()))

## Some lab values were revised at a later time given in the column 'labresultrevisedoffset'.
## Therefore, we take the most recently revised lab value for the records where lab values were drawn exactly at the same time
labs_dat_creatinine["rank"] = labs_dat_creatinine.groupby(['patientunitstayid',	'labresultoffset'])['labresultrevisedoffset'].rank("first", ascending=False)
labs_dat_creatinine = labs_dat_creatinine.loc[labs_dat_creatinine['rank']==1,:].drop(['rank','labresultrevisedoffset'], axis = 1)

# Merge labs data with vasopressor data
labs_dat_creatinine_new = third_vasp_time_dat.merge(labs_dat_creatinine, how='left', on='patientunitstayid')

# Create a column with time offset of the third vasopressor starting time to the time when the lab was drawn
labs_dat_creatinine_new['time_diff'] = labs_dat_creatinine_new['labresultoffset']-labs_dat_creatinine_new['third_vasp_drugstartoffset']

## Filtering out lab values that were not drawn within +/- 12 hours from third vasopressor starting time
labs_dat_creatinine_new['lower_bound'] = labs_dat_creatinine_new['third_vasp_drugstartoffset'] - 720
labs_dat_creatinine_new['upper_bound'] = labs_dat_creatinine_new['third_vasp_drugstartoffset'] + 720
labs_dat_creatinine_new = labs_dat_creatinine_new.loc[(labs_dat_creatinine_new.labresultoffset>=labs_dat_creatinine_new.lower_bound) & (labs_dat_creatinine_new.labresultoffset<=labs_dat_creatinine_new.upper_bound), :]

## Extract nearest creatinine value to the third vasopressor starting time
creatinine_val_dat = labs_dat_creatinine_new.groupby(['patientunitstayid']).apply(filter_labs).reset_index(name='creatinine')


units of blood creatinine:['mg/dL']


In [None]:
## There are non-numeric creatinine values. Therefore, we impute non-numeric creatinine lab values
creatinine_val_dat['creatinine_imputed'] = pd.to_numeric(creatinine_val_dat.creatinine, errors='coerce')
creatinine_val_dat.loc[np.isnan(pd.to_numeric(creatinine_val_dat.creatinine, errors='coerce')), 'creatinine_imputed']=creatinine_val_dat.loc[np.isnan(pd.to_numeric(creatinine_val_dat.creatinine, errors='coerce')),'creatinine'].apply(lambda x: x[1:])

creatinine_val_dat[np.isnan(pd.to_numeric(creatinine_val_dat.creatinine, errors='coerce'))] 

Unnamed: 0,patientunitstayid,creatinine,creatinine_imputed
893,1346834,<0.30,0.3
1249,2305583,<0.40,0.4
1259,2403553,<0.4,0.4


### Clean lactate lab data

In [None]:
## Lactate lab values
labs_dat_lactate = labs_dat.loc[labs_dat.labname == 'lactate',:].copy() 

## Check if all values are meassured with same units
print("units of lactate:"+ str(labs_dat_lactate.labmeasurenamesystem.unique()))

## Some lab values were revised at a later time given in the column 'labresultrevisedoffset'.
## Therefore, we take the most recently revised lab value for the records where lab values were drawn exactly at the same time
labs_dat_lactate["rank"] = labs_dat_lactate.groupby(['patientunitstayid',	'labresultoffset'])['labresultrevisedoffset'].rank("first", ascending=False)
labs_dat_lactate = labs_dat_lactate.loc[labs_dat_lactate['rank']==1,:].drop(['rank','labresultrevisedoffset'], axis = 1)

# Merge labs data with vasopressor data
labs_dat_lactate_new = third_vasp_time_dat.merge(labs_dat_lactate, how='left', on='patientunitstayid')

# Create a column with time offset of the third vasopressor starting time to the time when the lab was drawn
labs_dat_lactate_new['time_diff'] = labs_dat_lactate_new['labresultoffset']-labs_dat_lactate_new['third_vasp_drugstartoffset']

## Filtering out lab values that were not drawn within +/- 12 hours from third vasopressor starting time
labs_dat_lactate_new['lower_bound'] = labs_dat_lactate_new['third_vasp_drugstartoffset'] - 720
labs_dat_lactate_new['upper_bound'] = labs_dat_lactate_new['third_vasp_drugstartoffset'] + 720
labs_dat_lactate_new = labs_dat_lactate_new.loc[(labs_dat_lactate_new.labresultoffset>=labs_dat_lactate_new.lower_bound) & (labs_dat_lactate_new.labresultoffset<=labs_dat_lactate_new.upper_bound), :]

## Extract nearest lactate value to the third vasopressor starting time
lactate_val_dat = labs_dat_lactate_new.groupby(['patientunitstayid']).apply(filter_labs).reset_index(name='lactate')


units of lactate:['mmol/L']


In [None]:
## There are non-numeric lactate values. Therefore, we impute non-numeric lactate lab values
lactate_val_dat['lactate_imputed'] = pd.to_numeric(lactate_val_dat.lactate, errors='coerce')
lactate_val_dat.loc[np.isnan(pd.to_numeric(lactate_val_dat.lactate, errors='coerce')), 'lactate_imputed']=lactate_val_dat.loc[np.isnan(pd.to_numeric(lactate_val_dat.lactate, errors='coerce')),'lactate'].apply(lambda x: x[1:])

lactate_val_dat[np.isnan(pd.to_numeric(lactate_val_dat.lactate, errors='coerce'))] 

Unnamed: 0,patientunitstayid,lactate,lactate_imputed
96,334333,>13.3,13.3
98,394202,>26.6,26.6
205,644624,>12.0,12.0
233,678714,>20.0,20.0
481,1060148,>17.75,17.75
771,1513194,>20.00,20.0
913,2359321,>12.2,12.2
1029,3169225,<0.4,0.4
1032,3176837,>15.0,15.0
1033,3178924,>15.0,15.0


In [None]:
## Merge all labs data 
labs_final = creatinine_val_dat.merge(lactate_val_dat, how='inner', on='patientunitstayid')

In [None]:
patient_dat_new.head()

Unnamed: 0,patientHealthSystemStayID,patientUnitStayID,uniquepid,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,age_int
0,844848,1137569,011-43764,Female,75,Other/Unknown,MICU,Death,Expired,75
1,2462651,3036927,030-57208,Male,51,Caucasian,Med-Surg ICU,Home,Alive,51
2,2481889,3058863,030-13634,Female,31,Caucasian,Med-Surg ICU,Home,Alive,31
3,2494105,3072720,030-53536,Male,58,Caucasian,Med-Surg ICU,Home,Alive,58
4,2496542,3075429,030-59276,Female,68,Caucasian,Med-Surg ICU,Death,Expired,68
