<a href="https://colab.research.google.com/github/rmadushani/Three_vasopressor_problem/blob/main/codes/eICU_Three_Vasopressors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###
# @author: R. W. M. A. Madushani
# Created on Dec 20, 2021
###

In [2]:
# Imports for accessing eICU data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# Import libraries
import os
import pandas as pd
import numpy as np

In [3]:
auth.authenticate_user()

In [4]:
project_id = 'sccm-datathon'
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

In [5]:
# Read data from BigQuery into pandas dataframes.
def run_query(query):
  """Run a BigQuery query and return whatever `read_gbq` returns for it.

  `query` is passed through unchanged, together with the notebook-level
  `project_id` and the standard SQL dialect.
  """
  query_result = pd.io.gbq.read_gbq(query, project_id=project_id, dialect='standard')
  return query_result

# Extract and clean demographic data

In [6]:
# Extract relevant patient data
patient_dat_query = \
"""
SELECT uniquepid, patientHealthSystemStayID, patientUnitStayID, gender, age, ethnicity, unitType, hospitalDischargeLocation, hospitalDischargeStatus, hospitalAdmitOffset, unitVisitNumber
FROM `physionet-data.eicu_crd.patient`
"""

patient_dat = run_query(patient_dat_query)

## We consider only 'MICU', and 'Med-Surg ICU'
patient_dat_new = patient_dat.loc[(patient_dat.unitType=='MICU') | (patient_dat.unitType=='Med-Surg ICU'),: ].copy()

## Convert age>89 strings to numeric to extract patients with age>=18
patient_dat_new['age_int'] = patient_dat_new['age'].apply(lambda x: 90 if x=='> 89' else (-999 if x=='' else int(x)))
## Filter patients with age >= 18
patient_dat_new = patient_dat_new.loc[patient_dat_new.age_int >= 18, :]

## Remove records with missing hospital discharge status i.e., mortality
patient_dat_new = patient_dat_new.loc[patient_dat_new.hospitalDischargeStatus!='', :]
patient_dat_new.head()

Unnamed: 0,uniquepid,patientHealthSystemStayID,patientUnitStayID,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,hospitalAdmitOffset,unitVisitNumber,age_int
0,011-43764,844848,1137569,Female,75,Other/Unknown,MICU,Death,Expired,-36707,2,75
1,030-57208,2462651,3036927,Male,51,Caucasian,Med-Surg ICU,Home,Alive,-240,1,51
2,030-13634,2481889,3058863,Female,31,Caucasian,Med-Surg ICU,Home,Alive,-292,1,31
3,030-53536,2494105,3072720,Male,58,Caucasian,Med-Surg ICU,Home,Alive,0,1,58
4,030-59276,2496542,3075429,Female,68,Caucasian,Med-Surg ICU,Death,Expired,-1,1,68


In [7]:
## Ethnicity types
patient_dat_new.ethnicity.unique()

array(['Other/Unknown', 'Caucasian', 'Hispanic', 'African American',
       'Asian', '', 'Native American'], dtype=object)

In [8]:
## Convert missing ethnicity string '' to none
patient_dat_new.loc[patient_dat_new.ethnicity=='', 'ethnicity'] = None

In [9]:
(patient_dat_new.groupby('uniquepid')['ethnicity'].nunique()>1).sum() ## There are 359 patients with multiple ethnicies assigned

359

In [10]:
## Find patients with no ethnicity information available
ethnicity_missing = patient_dat_new.groupby('uniquepid')['ethnicity'].apply(lambda x: x.isnull().values.all())
ethnicity_missing.head()

uniquepid
002-10009    False
002-10018    False
002-10034    False
002-10050    False
002-10052    False
Name: ethnicity, dtype: bool

In [11]:
## Remove patients who don't have any ethnicity information available for any of their ICU stays (There were 1348 unique such patients)
## We clean ethnicity information by assigning a unique ethnicity for each patient after we extract our cohort of third vasopressor patients
patient_dat_new = patient_dat_new.loc[patient_dat_new.uniquepid.isin(ethnicity_missing [~ethnicity_missing].index),:]

In [12]:
## List of ICU stays ids to extract other variables
patientUnitStayIDs = list(patient_dat_new.patientUnitStayID)

# Extract and clean vasopressor medication data

In [13]:
# Extract vasopressor related medication data
vasp_med_query = \
"""
SELECT medicationid, patientunitstayid, drugstartoffset, drugname, drughiclseqno, routeadmin, frequency, drugstopoffset, gtc
FROM `physionet-data.eicu_crd.medication`
WHERE (drugname is null OR (UPPER(drugname) like '%EPINEPHRIN%') OR (UPPER(drugname) LIKE '%ADRENALIN%') OR (UPPER(drugname) LIKE '%PHENYLEPHRIN%') OR (UPPER(drugname) LIKE '%VASOPRESSIN%') OR (UPPER(drugname) like '%DOPAMINE%') OR (UPPER(drugname) like '%ANGIOTENSIN%')) 
"""

med_vasp_dat = run_query(vasp_med_query)

In [14]:
## Extract medication data corresponding to the filtered patient data
med_vasp_dat = med_vasp_dat.loc[med_vasp_dat.patientunitstayid.isin(patientUnitStayIDs),:]

## Vasopressors considered in the analysis are: 'dopamine', 'vasopressin', 'norepinephrine', 'phenylephrine', 'epinephrine', 'angiotensin'
## HICL codes for vasopressors (Taken from: https://github.com/MIT-LCP/eicu-code/blob/master/concepts/pivoted/pivoted-med.sql)
## Note: 'angiotensin' does not appear in eicu medication data and we were unable to find the corresponding HICL code for 'angiotensin'
hicl_vasp_dict = {
  'norepinephrine': [37410, 36346, 2051],
  'epinephrine': [37407, 39089, 36437, 34361, 2050],
  'dopamine': [2060, 2059],
  'phenylephrine': [37028, 35517, 35587, 2087],
  'vasopressin': [38884, 38883, 2839],
}


## User defined function to impute the vasopressor type (drugname column) using HICL codes
def return_key(val):
  """Return the vasopressor name whose HICL code list in "hicl_vasp_dict" contains `val`.

  Returns None when `val` is not in any of the code lists.
  """
  matching_names = (vasp_name for vasp_name, hicl_codes in hicl_vasp_dict.items() if val in hicl_codes)
  return next(matching_names, None)

## User defined function to clean the vasopressor type (drugname column) before imputing it using HICL codes
## 'clean_drugname' homogenizes the vasopressor names in the drugname column
def clean_drugname(name):
  """Map a free-text drug name to a homogenized vasopressor name.

  Input: "name" is a raw drug name; the match is case-insensitive and by substring.
  Output: one of 'norepinephrine', 'phenylephrine', 'epinephrine', 'vasopressin',
          'dopamine', 'angiotensin', or None when nothing matches.
          Missing values (None, NaN) and any other non-string input also give None.
  """
  ## Missing names may arrive as None or as a float NaN; neither has a usable .lower()
  if not isinstance(name, str):
    return None
  name_lower = name.lower()
  ## Order matters: 'norepinephrine' contains 'epinephrine', so it must be tested first
  vasopressor_names = ('norepinephrine', 'phenylephrine', 'epinephrine', 'vasopressin', 'dopamine', 'angiotensin')
  for vasopressor in vasopressor_names:
    if vasopressor in name_lower:
      return vasopressor
  return None


## Homogenize the vasopressor names in the drugname column
med_vasp_dat['drugname_cleaned'] = med_vasp_dat['drugname'].apply(clean_drugname)

## impute the vasopressor type (drugname column) using HICL codes
med_vasp_dat.loc[med_vasp_dat.loc[:, 'drugname'].isnull(), 'drugname_cleaned'] = med_vasp_dat.loc[med_vasp_dat.loc[:, 'drugname'].isnull(), 'drughiclseqno'].apply(return_key)

## Drop records with missing vasopressor type
med_vasp_dat_new = med_vasp_dat.loc[~med_vasp_dat.drugname_cleaned.isnull(),:].copy()

## Zero drug start/stop times may not be correct as they may represent nulls
## Therefore, we remove those records
med_vasp_dat_new = med_vasp_dat_new.loc[~((med_vasp_dat_new.drugstartoffset==0) | (med_vasp_dat_new.drugstopoffset==0)),:]

## Drop records with impossible start-stop times
med_vasp_dat_new = med_vasp_dat_new.loc[med_vasp_dat_new.drugstopoffset > med_vasp_dat_new.drugstartoffset, :]

## Drop records of vasopressor administration before ICU
## Note that if drugstartoffset is negative while drugstopoffset is positive, we do not drop those records
## For such records, we consider the vasopressor administration time from ICU admission to drug stop time to determine the 2-hour time criterion
med_vasp_dat_new = med_vasp_dat_new.loc[med_vasp_dat_new.drugstopoffset	> 0,:]

In [15]:
med_vasp_dat_new.head()

Unnamed: 0,medicationid,patientunitstayid,drugstartoffset,drugname,drughiclseqno,routeadmin,frequency,drugstopoffset,gtc,drugname_cleaned
77847,60621647,1695384,351,NOREPINEPHRINE,,IV CONT,INFUSE,1011,0,norepinephrine
78198,59562537,1607519,5335,NOREPINEPHRINE,,IV CONT,INFUSE,6827,0,norepinephrine
78465,57605875,1611927,-2,NOREPINEPHRINE,,IV CONT,X1 M959,145,0,norepinephrine
78537,61089483,1681549,1259,NOREPINEPHRINE,,IV CONT,CoNtInUos,1540,0,norepinephrine
83733,59354402,1643277,9151,NOREPINEPHRINE,,IV CONT,INFUSE,9560,0,norepinephrine


In [16]:
med_vasp_dat_new.routeadmin.unique() ## Note that there are data of which vasopressor routes of administration are not relevant to the study as we consider infusion drugs

array(['IV CONT', 'IR', 'IVPB', 'IV', 'SUBQ', 'IM', 'Miscellaneou',
       'Each Nostril', 'CENTRAL IV', '.ROUTE', 'X', 'See Instruct',
       'Intravenous', 'Intra-articular', 'PERIPH IV', 'Endotracheal',
       'ENDOSCOPY TU', 'INH', 'INHAL', 'NEBULIZER', 'IntraVENOUS', 'SubQ',
       'INTRAVEN', 'IV PUSH', 'IVP', 'Intramuscular', 'SUBCUT',
       'INTRAVEN.',
       'IV (intravenous)                                                                                    ',
       'IVDRIP', 'SQ', 'LVF', 'INTRAVENOUS CONTINUOUS', 'Central Line',
       'IV - continuous infusion (intravenous)                                                              ',
       'INJ', 'SC', 'IV Push', 'SUBCUTAN', 'Subcutaneous', 'Inj',
       'INTRAVENOUS', 'INTRAMUSCULAR', 'IM/SQ', 'SUB-Q', 'subCUT',
       'IV (injection)                                                                                      ',
       'IJ', 'I-dermal', 'INF', 'INTRALESION', 'INTRAVENOU', 'Epid',
       'IVADD', 'SUBCUTANEOU

In [17]:
## Filter data by route of administration
## Note that we selected these routes according to expert opinion after a thorough investigation of data
roa_list = ['INTRAVEN.', 'INF', 'IV', 'IV CONT', 'Intravenous', 'IntraVENOUS', 'INTRAVENOU', 'IVDRIP',  'INTRAVEN',  'INTRAVENOUS CONTINUOUS', 'IV (intravenous)                                                                                    ', 'IV - continuous infusion (intravenous)                                                              ', 'INTRAVENOUS',  'IVPB',  'CENTRAL IV', 'PERIPH IV', 'Central Line',  'IV - continuous infusion (injection)                                                                ', 'INJ', 'Inj']

med_vasp_dat_new = med_vasp_dat_new.loc[med_vasp_dat_new.routeadmin.isin(roa_list), ['patientunitstayid', 'drugstartoffset', 'drugstopoffset', 'drugname_cleaned']]
med_vasp_dat_new.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned
77847,1695384,351,1011,norepinephrine
78198,1607519,5335,6827,norepinephrine
78465,1611927,-2,145,norepinephrine
78537,1681549,1259,1540,norepinephrine
83733,1643277,9151,9560,norepinephrine


In what follows we extract start and stop times of third vasopressor administration. For that we:

(1) Merge overlapping times of same vasopressor administration per ICU stay

(2) Extract the start and stop times for third vasopressor administered continuously at least for 2 hours

In [18]:
## Define a function to merge overlapping/continuous vasopressor records
## Input: "dat_by_icu_stays" is data from "med_vasp_dat_new" grouped by 'patientunitstayid'
## Output: "df" is a dataframe with merged overlapping times per ICU stay
def merge_vasopressors(dat_by_icu_stays):
  """Merge overlapping/continuous administration records of the same vasopressor.

  Input: "dat_by_icu_stays" is a dataframe with the columns 'patientunitstayid',
         'drugstartoffset', 'drugstopoffset' and 'drugname_cleaned'
         (one group of "med_vasp_dat_new" grouped by 'patientunitstayid').
  Output: a dataframe with a fresh 0..n-1 index and one row per merged interval.
          Rows are ordered by vasopressor (in order of first appearance) and then by start time.
          A record is merged into the previous interval of the same vasopressor
          when it starts at or before that interval's stop time.

  The rows are collected in a list and the dataframe is built once, because
  DataFrame.append was removed in pandas 2.0 and row-by-row appends are quadratic.
  """
  base_columns = ['patientunitstayid', 'drugstartoffset', 'drugstopoffset', 'drugname_cleaned']
  merged_records = []
  for vasp in dat_by_icu_stays.drugname_cleaned.unique():
    vasp_records = (
        dat_by_icu_stays
        .loc[dat_by_icu_stays.drugname_cleaned == vasp, :]
        .sort_values(by=['drugstartoffset', 'drugstopoffset'], ascending=True)
        .to_dict('records'))
    ## "current" is the interval being extended; it restarts for every vasopressor type
    current = None
    for record in vasp_records:
      if (current is not None) and (record['drugstartoffset'] <= current['drugstopoffset']):
        ## Overlapping or touching record: extend the stop time if this record ends later
        current['drugstopoffset'] = max(current['drugstopoffset'], record['drugstopoffset'])
      else:
        current = dict(record)
        merged_records.append(current)
  if not merged_records:
    return pd.DataFrame(columns=base_columns)
  merged = pd.DataFrame(merged_records)
  ## Keep the four expected columns first; any additional input columns follow them
  extra_columns = [col for col in merged.columns if col not in base_columns]
  return merged.reindex(columns=base_columns + extra_columns)

In [19]:
## Merge overlapping vasopressor administrations
merged_dat = med_vasp_dat_new.groupby(by='patientunitstayid').apply(merge_vasopressors)

merged_dat.reset_index(drop=True, inplace=True)
merged_dat = merged_dat.astype({'patientunitstayid': 'int64', 'drugstartoffset': 'int64', 'drugstopoffset': 'int64'})
merged_dat.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned
0,141168,2046,2390,norepinephrine
1,141168,2121,2390,dopamine
2,141288,-8,3100,norepinephrine
3,141432,5,1210,norepinephrine
4,141585,7406,17745,norepinephrine


In [20]:
## Next we use "merged_dat" to extract third vasopressor times
## Note that we want the third vasopressor times for which the
## vasopressors were administered continuously for at least two hours during the
## ICU stay. The two-hour period is counted from the ICU admission.
## Therefore, we set the start time to zero for every vasopressor administration that began prior to ICU admission.
## However, we still want to keep the original start times for later use.
merged_dat.loc[:, 'drugstartoffset_original'] = merged_dat.loc[:, 'drugstartoffset']
## Change drug start time to ICU admission time when the drug start time is prior to ICU admission
merged_dat.loc[merged_dat.drugstartoffset < 0, 'drugstartoffset'] = 0

## Remove data of which vasopressor administration was less than two hours 
merged_dat = merged_dat.loc[(merged_dat.drugstopoffset-merged_dat.drugstartoffset) >= (2*60), :].copy()
merged_dat.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned,drugstartoffset_original
0,141168,2046,2390,norepinephrine,2046
1,141168,2121,2390,dopamine,2121
2,141288,0,3100,norepinephrine,-8
3,141432,5,1210,norepinephrine,5
4,141585,7406,17745,norepinephrine,7406


In [21]:
## Define a function to extract third vasopressor time
## Input: "dat" is data from "merged_dat" grouped by 'patientunitstayid'
## Output: "third_vasp_start_time_dat" is a dataframe with third vasopressor times per ICU stay
def third_vasp(dat):
  """Extract the third vasopressor record of one ICU stay.

  Input: "dat" is one group of "merged_dat" grouped by 'patientunitstayid', sorted by start time,
         with the columns 'patientunitstayid', 'drugstartoffset', 'drugstopoffset',
         'drugname_cleaned' and 'drugstartoffset_original'.
  Output: a dataframe with at most one row. The rows are scanned in windows of three consecutive
          records; the first window that holds three different vasopressors running together for
          at least 120 minutes (earliest stop time minus latest start time) is selected.
          The returned row is the third record of that window, extended with the original start
          times and names of the first and second records. If no window qualifies, an empty
          dataframe with the same columns is returned.

  Columns are read by name instead of by position, and the result is built directly,
  because DataFrame.append was removed in pandas 2.0.
  """
  out_columns = ['patientunitstayid', 'drugstartoffset', 'drugstopoffset', 'drugname_cleaned', 'drugstartoffset_original', 'first_vasp_drugstartoffset', 'second_vasp_drugstartoffset', 'drugname_first_vasp', 'drugname_second_vasp']
  for row_id in range(dat.shape[0]-2):
    window = dat.iloc[row_id:(row_id+3)]
    ## Time during which all three records of the window run simultaneously
    overlap = window.drugstopoffset.min() - window.drugstartoffset.max()
    if (window.drugname_cleaned.nunique() == 3) and (overlap >= 120):
      first_rec, second_rec, third_rec = window.iloc[0], window.iloc[1], window.iloc[2]
      record = third_rec.to_dict()
      ## The first/second start times are the original ones (not clipped to ICU admission)
      record['first_vasp_drugstartoffset'] = first_rec['drugstartoffset_original']
      record['second_vasp_drugstartoffset'] = second_rec['drugstartoffset_original']
      record['drugname_first_vasp'] = first_rec['drugname_cleaned']
      record['drugname_second_vasp'] = second_rec['drugname_cleaned']
      ## Keep the expected columns first; any additional input columns follow them
      extra_columns = [col for col in record if col not in out_columns]
      return pd.DataFrame([record]).reindex(columns=out_columns + extra_columns)
  return pd.DataFrame(columns=out_columns)

In [22]:
## Extract third vasopressor times
merged_dat.sort_values(by=['patientunitstayid', 'drugstartoffset', 'drugstopoffset'], ascending=True, inplace=True)
third_vasp_time_dat = merged_dat.groupby(by='patientunitstayid').apply(third_vasp)
third_vasp_time_dat = third_vasp_time_dat.astype({'patientunitstayid': 'int64', 'drugstartoffset': 'int64', 'drugstopoffset': 'int64', 'drugstartoffset_original':  'int64', 'first_vasp_drugstartoffset': 'int64', 'second_vasp_drugstartoffset': 'int64'})
third_vasp_time_dat.reset_index(drop=True, inplace=True)
third_vasp_time_dat.head()

Unnamed: 0,patientunitstayid,drugstartoffset,drugstopoffset,drugname_cleaned,drugstartoffset_original,first_vasp_drugstartoffset,second_vasp_drugstartoffset,drugname_first_vasp,drugname_second_vasp
0,144173,107,390,vasopressin,107,47,92,norepinephrine,phenylephrine
1,145394,2982,10012,phenylephrine,2982,222,2982,norepinephrine,vasopressin
2,146133,1663,2039,dopamine,1663,43,268,norepinephrine,vasopressin
3,146349,677,3286,phenylephrine,677,497,497,vasopressin,norepinephrine
4,147985,369,3009,norepinephrine,369,174,279,vasopressin,epinephrine


In [23]:
## Check for any third vasopressor starting time prior to ICU admission
print(sum(third_vasp_time_dat.drugstartoffset_original<0))

118


In [24]:
## Remove third vasopressor start times prior to ICU admission
third_vasp_time_dat = third_vasp_time_dat.loc[third_vasp_time_dat.drugstartoffset_original>=0,:]

In [25]:
## Rename columns of third vasopressor times
third_vasp_time_dat.rename(columns={'drugstartoffset': 'third_vasp_drugstartoffset', 'drugstopoffset': 'third_vasp_drugstopoffset', 'drugstartoffset_original': 'third_vasp_drugstartoffset_original', 'drugname_cleaned': 'drugname'}, inplace=True)
third_vasp_time_dat.head()

Unnamed: 0,patientunitstayid,third_vasp_drugstartoffset,third_vasp_drugstopoffset,drugname,third_vasp_drugstartoffset_original,first_vasp_drugstartoffset,second_vasp_drugstartoffset,drugname_first_vasp,drugname_second_vasp
0,144173,107,390,vasopressin,107,47,92,norepinephrine,phenylephrine
1,145394,2982,10012,phenylephrine,2982,222,2982,norepinephrine,vasopressin
2,146133,1663,2039,dopamine,1663,43,268,norepinephrine,vasopressin
3,146349,677,3286,phenylephrine,677,497,497,vasopressin,norepinephrine
4,147985,369,3009,norepinephrine,369,174,279,vasopressin,epinephrine


# Clean Ethnicity Information

In [26]:
## Demographic info for third vasopressor patients
patient_dat_new = patient_dat_new.loc[patient_dat_new.patientUnitStayID.isin(third_vasp_time_dat.patientunitstayid), :]
patient_dat_new.head()

Unnamed: 0,uniquepid,patientHealthSystemStayID,patientUnitStayID,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,hospitalAdmitOffset,unitVisitNumber,age_int
1468,006-24923,645462,858255,Male,76,Caucasian,Med-Surg ICU,Skilled Nursing Facility,Alive,-1868,2,76
2296,006-249436,663463,888281,Male,78,Caucasian,Med-Surg ICU,Other External,Alive,-5224,3,78
2304,006-6800,676864,910583,Female,69,Caucasian,Med-Surg ICU,Death,Expired,-588,2,69
2491,006-202090,591475,768433,Male,58,Asian,Med-Surg ICU,Home,Alive,-1765,2,58
3712,006-86051,471667,568682,Female,78,Caucasian,Med-Surg ICU,Death,Expired,-611,2,78


In [27]:
(patient_dat_new.groupby('uniquepid')['ethnicity'].nunique()>1).sum() ## There is no patient with multiple ethnicies assigned

0

In [28]:
## Number of patients with missing ethnicity information for the ICU stay that was selected under the third vasopressor criteria
## Note that the ethnicity info of these ICU stays can be found from these patients' other available ICU stays in "patient_dat"
patient_dat_new.ethnicity.isnull().sum() 

1

In [29]:
## 'patientUnitStayID' of the ICU stay with missing ethnicity info
## Series.item() returns the single matching value and raises a ValueError unless exactly one
## ICU stay has missing ethnicity; calling int() directly on a Series is deprecated in recent pandas
pid_missing_ethnicity = int(patient_dat_new.loc[patient_dat_new.ethnicity.isnull(), 'patientUnitStayID'].item())
pid_missing_ethnicity

3242563

In [30]:
## Unique number of ethnicity types assigned for the patient with missing ethnicity  in his/her other ICU stays 
patient_dat.loc[(patient_dat.uniquepid.isin(patient_dat_new.loc[patient_dat_new.ethnicity.isnull(),'uniquepid']) & (patient_dat.patientUnitStayID != pid_missing_ethnicity)),'ethnicity'].nunique()

1

In [31]:
## Impute missing ethnicity extracted from other ICU stays
patient_dat_new.loc[patient_dat_new.ethnicity.isnull(),'ethnicity'] = patient_dat.loc[(patient_dat.uniquepid.isin(patient_dat_new.loc[patient_dat_new.ethnicity.isnull(),'uniquepid']) & (patient_dat.patientUnitStayID != pid_missing_ethnicity)),'ethnicity'].values

In [32]:
## Create a new column with dummy coding ethnicity as Caucasian = 1, and 0 otherwise
patient_dat_new['ethnicity_cat'] = patient_dat_new['ethnicity'].apply(lambda x: 1 if x=='Caucasian' else 0)

# Extract and clean labs data
### We consider two labs in our analysis: Creatinine (blood test) and Lactate

In [33]:
# Extract relevant lab data
labs_dat_query = \
"""
SELECT *
FROM `physionet-data.eicu_crd.lab` io
WHERE (io.labname is null OR io.labname LIKE '%lactate%' OR io.labname LIKE '%creatinine%')
"""

labs_dat = run_query(labs_dat_query)

In [34]:
labs_dat.head()

Unnamed: 0,labid,patientunitstayid,labresultoffset,labtypeid,labname,labresult,labresulttext,labmeasurenamesystem,labmeasurenameinterface,labresultrevisedoffset
0,58862858,144944,9443,4,urinary creatinine,117.85,117.85,mg/dL,mg/dL,9543
1,78420565,244059,9815,4,urinary creatinine,56.0,56.0,mg/dL,mg/dL,9864
2,231246993,962661,555,4,urinary creatinine,107.0,107.0,mg/dL,mg/dL,712
3,231339936,966484,1087,4,urinary creatinine,64.0,64.0,mg/dL,mg/dL,1171
4,231410201,972737,913,4,urinary creatinine,29.0,29.0,mg/dL,mg/dL,1021


In [35]:
labs_dat.labname.unique() ## Note that we have extracted both blood and urine creatinine

array(['urinary creatinine', 'lactate', 'creatinine'], dtype=object)

In [36]:
## Number of missings in lab values
print("missings in text lab values:" + str(labs_dat.labresulttext.isnull().sum()))

missings in text lab values:0


In [37]:
## Non-numeric lab value types
labs_dat.loc[np.isnan(pd.to_numeric(labs_dat.labresulttext, errors='coerce')), 'labresulttext'].unique()

array(['>200.0', '<0.3', '<0.5', '>15.0', '<.1', '>19.79', '>13.3', '<.3',
       '<0.1', '>19.31', '<0.4', '<0.50', '<0.2', '>12.0', '<0.30',
       '<0.20', '<0.17', '<0.15', '<0.10', '>20.0', '<0.40', '<0.150',
       '<5.0', '<10', '<0.14', '<0.60', '>17.75', '>19.34', '>12.2',
       '>18.73', '>20.00', '>13.0', '<1.0', '>346.50', '<1.1', '>23.3',
       '<0.2  ', '>16.0', '>18.49', '<10.0', '>20.50', '<0.0', '>16.00',
       '>17.00', '>30.0', '>25.00', '>16.4', '', '<0.7', '>37.00',
       '>18.5', '<1.00', '<1.0  ', '>15.5', '>26.6', '<0.55', '>10.8',
       '<0.04', '<0.05', '<4.2', '>15.00', '<17', '>18.0', '>28.0',
       '<60.0', '>17.37', '<.4', '<0.65', '>24.0'], dtype=object)

In [38]:
## Convert missing labvalues indicated as '' to none
labs_dat.loc[labs_dat.labresulttext=='', 'labresulttext'] = None

In [39]:
## Note that there are more additional lab values present in 'labresulttext' column.
## Therefore we consider 'labresulttext' column to extract values.
print("missings in text lab values:" + str(labs_dat.labresulttext.isnull().sum()))
print("missings in numeric lab values:" + str(labs_dat.labresult.isnull().sum()))
print("Difference in lab values (text vs numeric):" + str(sum(labs_dat.loc[~labs_dat.labresult.isnull(), "labresult"] -labs_dat.loc[~labs_dat.labresult.isnull(), "labresulttext"].astype('float'))))

labs_dat.drop(['labresult', 'labmeasurenameinterface', 'labid', 'labtypeid'], axis = 1, inplace=True)
labs_dat.head()

missings in text lab values:4
missings in numeric lab values:3008
Difference in lab values (text vs numeric):0.0


Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,labresultrevisedoffset
0,144944,9443,urinary creatinine,117.85,mg/dL,9543
1,244059,9815,urinary creatinine,56.0,mg/dL,9864
2,962661,555,urinary creatinine,107.0,mg/dL,712
3,966484,1087,urinary creatinine,64.0,mg/dL,1171
4,972737,913,urinary creatinine,29.0,mg/dL,1021


##### In what follows we extract nearest lab value drawn within +/- 12 hours from the third vasopressor starting time. In order to do that we:

  ##### (1) Filter out all the lab values that were not drawn within +/- 12 hours from the third vasopressor starting time.

  ##### (2) We extract the lab value drawn at the closest time to the third vasopressor starting time, giving priority to values drawn prior to the third vasopressor starting time. In other words, we obtain lab values drawn after the third vasopressor starting time only for the cases where there is no available lab value drawn prior to the third vasopressor starting time.

In [40]:
## Function "filter_labs" extracts the lab value drawn closest to the third vasopressor starting time
## Input: grouped lab data by 'patientunitstayid'
##        Input data should have labs values only within +/- 12 hours from the third vasopressor starting time.
##        Input data should also have a column called 'time_diff' which contains the time offset of the third vasopressor start time from the time the lab value was drawn 
## Output: Returns the lab value drawn closest to the third vasopressor starting time. Note that we give priority for lab values drawn prior to the third vasopressor starting time
def filter_labs(grouped_dat):
  """Return the 'labresulttext' of the lab drawn closest to the third vasopressor start.

  Labs with a negative 'time_diff' take priority: if any exist, the one with the
  largest (closest to zero) 'time_diff' is used. Otherwise the lab with the smallest
  non-negative 'time_diff' is used.
  """
  offsets = grouped_dat['time_diff']
  drawn_before = offsets[offsets < 0]
  if len(drawn_before) > 0:
    chosen_index = drawn_before.idxmax()
  else:
    chosen_index = offsets[offsets >= 0].idxmin()
  return grouped_dat.loc[chosen_index, 'labresulttext']

### Clean creatinine lab data

In [41]:
## We only consider blood creatinine in our analysis
labs_dat_creatinine = labs_dat.loc[labs_dat.labname == 'creatinine',:].copy() 

## Check if all values are measured with the same units
print("units of blood creatinine:"+ str(labs_dat_creatinine.labmeasurenamesystem.unique()))

## Some lab values were revised at a later time given in the column 'labresultrevisedoffset'.
## Therefore, we take the most recently revised lab value for the records where lab values were drawn exactly at the same time
labs_dat_creatinine["rank"] = labs_dat_creatinine.groupby(['patientunitstayid',	'labresultoffset'])['labresultrevisedoffset'].rank("first", ascending=False)
labs_dat_creatinine = labs_dat_creatinine.loc[labs_dat_creatinine['rank']==1,:].drop(['rank','labresultrevisedoffset'], axis = 1)

## Remove records with missing lab values
labs_dat_creatinine = labs_dat_creatinine.loc[~labs_dat_creatinine.labresulttext.isnull(), :]

# Merge labs data with vasopressor data
labs_dat_creatinine_new = third_vasp_time_dat.merge(labs_dat_creatinine, how='left', on='patientunitstayid')

# Create a column with time offset of the third vasopressor starting time to the time when the lab was drawn
labs_dat_creatinine_new['time_diff'] = labs_dat_creatinine_new['labresultoffset']-labs_dat_creatinine_new['third_vasp_drugstartoffset']

## Filtering out lab values that were not drawn within +/- 12 hours from third vasopressor starting time
labs_dat_creatinine_new['lower_bound'] = labs_dat_creatinine_new['third_vasp_drugstartoffset'] - 720
labs_dat_creatinine_new['upper_bound'] = labs_dat_creatinine_new['third_vasp_drugstartoffset'] + 720
labs_dat_creatinine_new = labs_dat_creatinine_new.loc[(labs_dat_creatinine_new.labresultoffset>=labs_dat_creatinine_new.lower_bound) & (labs_dat_creatinine_new.labresultoffset<=labs_dat_creatinine_new.upper_bound), :]

## Extract nearest creatinine value to the third vasopressor starting time
creatinine_val_dat = labs_dat_creatinine_new.groupby(['patientunitstayid']).apply(filter_labs).reset_index(name='creatinine')


units of blood creatinine:['mg/dL']


In [42]:
## There are non-numeric creatinine values (e.g. '<0.30'). Therefore, we impute non-numeric creatinine lab values
## by dropping the leading comparison sign and using the remaining number
creatinine_numeric = pd.to_numeric(creatinine_val_dat.creatinine, errors='coerce')
non_numeric_creatinine = creatinine_numeric.isnull()
creatinine_val_dat['creatinine_imputed'] = creatinine_numeric
## Convert the stripped text explicitly so that 'creatinine_imputed' stays a float column
## (assigning strings into a float column relies on implicit casting that is deprecated in recent pandas)
creatinine_val_dat.loc[non_numeric_creatinine, 'creatinine_imputed'] = pd.to_numeric(creatinine_val_dat.loc[non_numeric_creatinine, 'creatinine'].apply(lambda x: x[1:]), errors='coerce')

creatinine_val_dat[non_numeric_creatinine]

Unnamed: 0,patientunitstayid,creatinine,creatinine_imputed
890,1346834,<0.30,0.3
1188,2305583,<0.40,0.4
1198,2403553,<0.4,0.4


In [43]:
## Number of ICU stays with missing creatinine
third_vasp_time_dat.shape[0]-creatinine_val_dat.shape[0]

59

In [44]:
## Percentage of ICU stays with missing creatinine
((third_vasp_time_dat.shape[0]-creatinine_val_dat.shape[0])/third_vasp_time_dat.shape[0])*100

3.641975308641975

##### Since it is highly unlikely to have missing creatinine for ICU patients, we will consider values outside of +/-12 hours of third vasopressor start time to impute missing values. Following are the steps to impute missing values:

(1) If the last creatinine value is prior to -12 hours of third vasopressor start time, we use the last value available.

(2) If creatinine values are available prior to -12 hours and after +12 hours of third vasopressor start time but not within +/- 12 hours, then we use the nearest one to the start time of the 3rd vasopressor.

(3) If the first available creatinine value was drawn after +12 hours of third vasopressor start time, we take that value as long as the third vasopressor is still running (i.e., third vasopressor stop time >= creatinine lab timestamp)

If creatinine is still missing after these imputations, it might be the case that the corresponding records are not from ICU stays and therefore, we drop those records from the analysis

In [45]:
## ICU stays with missing creatinine values
missing_creatinine_pids = list(third_vasp_time_dat.loc[~third_vasp_time_dat.patientunitstayid.isin(creatinine_val_dat.patientunitstayid), 'patientunitstayid'])

In [46]:
## All lab value records for ICU stays with missing creatinine within the window +/-12 hours from third vasopressor start time
labs_dat_creatinine_missing_pid = labs_dat_creatinine.loc[labs_dat_creatinine.patientunitstayid.isin(missing_creatinine_pids), :].copy()
labs_dat_creatinine_missing_pid.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem
14038,156843,49628,creatinine,0.48,mg/dL
17076,156843,28033,creatinine,0.63,mg/dL
19153,461470,1867,creatinine,2.73,mg/dL
20263,429502,2246,creatinine,0.59,mg/dL
20827,830055,23928,creatinine,0.31,mg/dL


In [47]:
labs_dat_creatinine_missing_pid.patientunitstayid.nunique()  ## 48 unique ICU stays have at least one value recorded for creatinine out of 59 missings
## Therefore, we will have to drop 59-48=11 records from the analysis due to no creatinine available

48

In [48]:
# Merge labs data with vasopressor data
labs_dat_creatinine_missing_pid = labs_dat_creatinine_missing_pid.merge(third_vasp_time_dat[['patientunitstayid', 'third_vasp_drugstartoffset', 'third_vasp_drugstopoffset']], how='left', on='patientunitstayid')
labs_dat_creatinine_missing_pid.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,third_vasp_drugstartoffset,third_vasp_drugstopoffset
0,156843,49628,creatinine,0.48,mg/dL,56853,57844
1,156843,28033,creatinine,0.63,mg/dL,56853,57844
2,461470,1867,creatinine,2.73,mg/dL,6802,11153
3,429502,2246,creatinine,0.59,mg/dL,3252,4433
4,830055,23928,creatinine,0.31,mg/dL,47414,50590


In [49]:
# Create a column with time offset of the third vasopressor starting time to the time when the lab was drawn
labs_dat_creatinine_missing_pid['time_diff'] = labs_dat_creatinine_missing_pid['labresultoffset']-labs_dat_creatinine_missing_pid['third_vasp_drugstartoffset']
labs_dat_creatinine_missing_pid.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,third_vasp_drugstartoffset,third_vasp_drugstopoffset,time_diff
0,156843,49628,creatinine,0.48,mg/dL,56853,57844,-7225
1,156843,28033,creatinine,0.63,mg/dL,56853,57844,-28820
2,461470,1867,creatinine,2.73,mg/dL,6802,11153,-4935
3,429502,2246,creatinine,0.59,mg/dL,3252,4433,-1006
4,830055,23928,creatinine,0.31,mg/dL,47414,50590,-23486


In [50]:
## Extract last creatinine value for missing records
last_creatinine_dat = labs_dat_creatinine_missing_pid.copy()
last_creatinine_dat["rank"] = last_creatinine_dat.groupby(['patientunitstayid'])['labresultoffset'].rank("first", ascending=False)
last_creatinine_dat = last_creatinine_dat.loc[last_creatinine_dat['rank'] == 1, :].drop(['rank'], axis = 1)
last_creatinine_dat.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,third_vasp_drugstartoffset,third_vasp_drugstopoffset,time_diff
15,753061,853,creatinine,2.63,mg/dL,2123,2614,-1270
22,2094461,6723,creatinine,0.86,mg/dL,210,3027,6513
35,156843,55858,creatinine,0.55,mg/dL,56853,57844,-995
44,429502,4231,creatinine,1.94,mg/dL,3252,4433,979
59,1118949,8735,creatinine,0.55,mg/dL,650,4703,8085


In [51]:
## Records imputed with last creatinine
last_creatinine_imputed_dat = last_creatinine_dat.loc[last_creatinine_dat.time_diff < 0, ['patientunitstayid', 'labresulttext']].copy()
last_creatinine_imputed_dat.rename(columns={"labresulttext": "creatinine"}, inplace=True)
last_creatinine_imputed_dat.head()

Unnamed: 0,patientunitstayid,creatinine
15,753061,2.63
35,156843,0.55
60,858255,0.96
64,592952,1.39
77,684881,2.67


In [52]:
last_creatinine_imputed_dat.shape[0] ## Out of 48, only 33 records were imputed with last available creatinine value

33

In [53]:
## Function "nearest_lab" extracts the lab value drawn nearest to the third vasopressor starting time
## when there is no value available within +/-12 hours but values are available both before -12 hours
## and after +12 hours of the third vasopressor start time
## Input: grouped lab data by 'patientunitstayid'
##        Input data should have labs values only for ICU stays that have no value available within +/- 12 hours from the third vasopressor starting time.
##        Input data should also have a column called 'abs_time_diff' which contains the absolute value of the time offset between third vasopressor start time and the time the lab value was drawn 
## Output: Returns the lab value drawn nearest to the third vasopressor starting time.

def nearest_lab(grouped_dat):
  if (any(grouped_dat.labresultoffset < grouped_dat.lower_bound) &  any(grouped_dat.labresultoffset > grouped_dat.upper_bound)):
    return(grouped_dat.loc[grouped_dat.loc[:, 'abs_time_diff'].idxmin(), 'labresulttext'])

In [54]:
## Creatinine rows of the stays that were NOT imputed with their last value,
## extended with the +/-12 hour (720) window bounds around the third vasopressor
## start and the absolute distance of each draw from that start
labs_dat_creatinine_missing_pid_new = (
    labs_dat_creatinine_missing_pid
    .loc[lambda d: ~d.patientunitstayid.isin(last_creatinine_imputed_dat.patientunitstayid), :]
    .assign(
        lower_bound=lambda d: d['third_vasp_drugstartoffset'] - 720,
        upper_bound=lambda d: d['third_vasp_drugstartoffset'] + 720,
        abs_time_diff=lambda d: d['time_diff'].abs(),
    )
)

In [55]:
## Apply nearest_lab per ICU stay; stays for which it yields no value are removed afterwards
nearest_creatinine_val_dat = (
    labs_dat_creatinine_missing_pid_new
    .groupby(['patientunitstayid'])
    .apply(nearest_lab)
    .reset_index(name='creatinine')
)
nearest_creatinine_val_dat = nearest_creatinine_val_dat.loc[nearest_creatinine_val_dat.creatinine.notnull(), :]
nearest_creatinine_val_dat.head()

Unnamed: 0,patientunitstayid,creatinine
0,157582,2.4
1,429502,1.94
2,461470,2.47
3,830055,0.52
4,943730,2.75


In [56]:
len(nearest_creatinine_val_dat) ## Out of remaining 48-33=15, 12 records were imputed with nearest available lab value outside +/-12 hours window

12

In [57]:
## Creatinine rows of the stays that received neither the last nor the nearest value
labs_dat_creatinine_missing_pid_last = (
    labs_dat_creatinine_missing_pid_new
    .loc[lambda d: ~d.patientunitstayid.isin(nearest_creatinine_val_dat.patientunitstayid), :]
    .copy()
)
labs_dat_creatinine_missing_pid_last.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,third_vasp_drugstartoffset,third_vasp_drugstopoffset,time_diff,lower_bound,upper_bound,abs_time_diff
21,1329588,883,creatinine,0.6,mg/dL,93,3131,790,-627,813,790
22,2094461,6723,creatinine,0.86,mg/dL,210,3027,6513,-510,930,6513
24,1118949,1575,creatinine,0.79,mg/dL,650,4703,925,-70,1370,925
27,1118949,2952,creatinine,0.78,mg/dL,650,4703,2302,-70,1370,2302
47,1329588,18305,creatinine,0.78,mg/dL,93,3131,18212,-627,813,18212


In [58]:
## Note for all records, lab values are drawn after +12 hours of third vasopressor start time
(labs_dat_creatinine_missing_pid_last.labresultoffset > labs_dat_creatinine_missing_pid_last.upper_bound).all()

True

In [59]:
## We take the first available creatinine value after third vasopressor start time + 12 hours only if the third vasopressor stop time is not earlier than the time the lab was drawn
## Therefore, we drop records with labresultoffset > third vasopressor stop time
labs_dat_creatinine_missing_pid_last = labs_dat_creatinine_missing_pid_last.loc[
    lambda d: d.labresultoffset <= d.third_vasp_drugstopoffset, :
]

In [60]:
## Earliest remaining creatinine draw per ICU stay
## (rank 1 in ascending labresultoffset; method "first" breaks ties by row order)
first_creatinine_dat = (
    labs_dat_creatinine_missing_pid_last
    .assign(rank=lambda d: d.groupby(['patientunitstayid'])['labresultoffset'].rank("first", ascending=True))
    .loc[lambda d: d['rank'] == 1, ['patientunitstayid', 'labresulttext']]
    .rename(columns={"labresulttext": "creatinine"})
)
first_creatinine_dat.head()

Unnamed: 0,patientunitstayid,creatinine
21,1329588,0.6
24,1118949,0.79
124,2094461,0.85


In [61]:
## Stack the three imputation results (last / nearest / first value) and add a numeric copy
## of the creatinine text; text that cannot be parsed as a number becomes NaN
imputed_creatinine_dat_all = (
    pd.concat([last_creatinine_imputed_dat, nearest_creatinine_val_dat, first_creatinine_dat], ignore_index=True)
    .assign(creatinine_imputed=lambda d: pd.to_numeric(d.creatinine, errors='coerce'))
)
## Report how many values failed the numeric conversion
print('Number of creatinine values with symbols = ' + str(imputed_creatinine_dat_all.creatinine_imputed.isna().sum()))
imputed_creatinine_dat_all.head()

Number of creatinine values with symbols = 0


Unnamed: 0,patientunitstayid,creatinine,creatinine_imputed
0,753061,2.63,2.63
1,156843,0.55,0.55
2,858255,0.96,0.96
3,592952,1.39,1.39
4,684881,2.67,2.67


In [62]:
## Combine imputed records of creatinine with creatinine data extracted within +/-12 hours of 3rd vasopressor start time
## This cell appends to creatinine_val_dat itself, so running it a second time would
## append the imputed rows again. Keeping the first row per ICU stay makes the cell
## safe to re-run.
## NOTE(review): assumes creatinine_val_dat already holds at most one row per
## patientunitstayid before this cell — confirm against the cell that builds it.
creatinine_val_dat = (
    pd.concat([creatinine_val_dat, imputed_creatinine_dat_all], ignore_index=True)
    .drop_duplicates(subset='patientunitstayid', keep='first', ignore_index=True)
)

### Clean lactate lab data

In [63]:
## Restrict the lab table to lactate measurements
labs_dat_lactate = labs_dat.loc[lambda d: d.labname == 'lactate', :].copy()

## Check if all values are measured with the same units
print("units of lactate:"+ str(labs_dat_lactate.labmeasurenamesystem.unique()))

## Some lab values were revised at a later time given in the column 'labresultrevisedoffset'.
## For rows of the same stay drawn at exactly the same time we keep the most recently revised one,
## then remove rows without a lab value.
labs_dat_lactate = (
    labs_dat_lactate
    .assign(rank=lambda d: d.groupby(['patientunitstayid', 'labresultoffset'])['labresultrevisedoffset'].rank("first", ascending=False))
    .loc[lambda d: d['rank'] == 1, :]
    .drop(columns=['rank', 'labresultrevisedoffset'])
    .loc[lambda d: ~d.labresulttext.isnull(), :]
)

## Merge labs data with vasopressor data, add the offset of each draw relative to the
## third vasopressor start and the +/- 12 hour (720) window bounds, and keep only
## the draws that fall inside that window (bounds inclusive)
labs_dat_lactate_new = (
    third_vasp_time_dat
    .merge(labs_dat_lactate, how='left', on='patientunitstayid')
    .assign(
        time_diff=lambda d: d['labresultoffset'] - d['third_vasp_drugstartoffset'],
        lower_bound=lambda d: d['third_vasp_drugstartoffset'] - 720,
        upper_bound=lambda d: d['third_vasp_drugstartoffset'] + 720,
    )
    .loc[lambda d: (d.labresultoffset >= d.lower_bound) & (d.labresultoffset <= d.upper_bound), :]
)

## Reduce the in-window draws of each stay to one lactate value with filter_labs
## (defined earlier in the notebook)
lactate_val_dat = (
    labs_dat_lactate_new
    .groupby(['patientunitstayid'])
    .apply(filter_labs)
    .reset_index(name='lactate')
)


units of lactate:['mmol/L']


In [64]:
## There are non-numeric lactate values (e.g. '>20.0', '<0.4'). Therefore, we impute
## non-numeric lactate lab values by dropping the leading symbol and using the remaining number.
lactate_val_dat['lactate_imputed'] = pd.to_numeric(lactate_val_dat.lactate, errors='coerce')

## Rows whose text could not be parsed as a number (computed once and reused below)
non_numeric_lactate = lactate_val_dat['lactate_imputed'].isna()

## Convert the stripped text to float explicitly: writing raw strings into the float
## column relies on implicit coercion, which newer pandas versions no longer perform
## silently (the column would end up with object dtype). Text that is still not
## numeric after removing the first character stays NaN.
lactate_val_dat.loc[non_numeric_lactate, 'lactate_imputed'] = pd.to_numeric(
    lactate_val_dat.loc[non_numeric_lactate, 'lactate'].astype(str).str[1:],
    errors='coerce',
)

lactate_val_dat[non_numeric_lactate]

Unnamed: 0,patientunitstayid,lactate,lactate_imputed
94,334333,>13.3,13.3
202,644624,>12.0,12.0
230,678714,>20.0,20.0
478,1060148,>17.75,17.75
767,1513194,>20.00,20.0
896,2359321,>12.2,12.2
1011,3169225,<0.4,0.4
1014,3176837,>15.0,15.0
1015,3178924,>15.0,15.0
1120,3239318,>13.0,13.0


In [65]:
## Number of ICU stays without a lactate value (row-count difference of the two tables)
len(third_vasp_time_dat) - len(lactate_val_dat)

464

In [66]:
## Share of ICU stays without a lactate value, in percent
((len(third_vasp_time_dat) - len(lactate_val_dat)) / len(third_vasp_time_dat)) * 100

28.641975308641975

##### Since we have a large number of records with missing lactate values, we will consider values outside of +/-12 hours of the third vasopressor start time to impute missing values. The steps to impute missing values are as follows:

(1) If the last lactate value is prior to -12 hours of third vasopressor start time, we use the last lactate value available.

(2) If lactate values are available prior to -12 hours and after +12 hours of third vasopressor start time but not within +/- 12 hours, then we use the nearest one to the start time of the 3rd vasopressor.

(3) If the first available lactate value was drawn after +12 hours of third vasopressor start time, we take that value as long as the third vasopressor is still running (i.e., third vasopressor stop time >= lactate lab timestamp)

In [67]:
## IDs of the ICU stays that have no entry in lactate_val_dat
missing_lactate_pids = list(
    third_vasp_time_dat.loc[
        lambda d: ~d.patientunitstayid.isin(lactate_val_dat.patientunitstayid),
        'patientunitstayid',
    ]
)

In [68]:
## Every lactate row (at any time) of the ICU stays that lack a lactate value
## within +/-12 hours of the third vasopressor start time
labs_dat_lactate_missing_pid = (
    labs_dat_lactate
    .loc[lambda d: d.patientunitstayid.isin(missing_lactate_pids), :]
    .copy()
)
labs_dat_lactate_missing_pid.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem
1478,2916248,887,lactate,14.9,mmol/L
1591,188948,742,lactate,1.2,mmol/L
1978,183431,1120,lactate,9.2,mmol/L
2037,508124,305,lactate,2.3,mmol/L
2289,1128129,21039,lactate,1.3,mmol/L


In [69]:
labs_dat_lactate_missing_pid['patientunitstayid'].nunique() ## Only 195 unique ICU stays have any value recorded for lactate out of 464 missings

195

In [70]:
# Merge labs data with vasopressor data
# The frame is merged into itself, so a second run of this cell would find the
# vasopressor columns already present and produce '_x'/'_y' suffixed duplicates.
# Dropping them first (ignored when absent) keeps the cell safe to re-run.
labs_dat_lactate_missing_pid = (
    labs_dat_lactate_missing_pid
    .drop(columns=['third_vasp_drugstartoffset', 'third_vasp_drugstopoffset'], errors='ignore')
    .merge(
        third_vasp_time_dat[['patientunitstayid', 'third_vasp_drugstartoffset', 'third_vasp_drugstopoffset']],
        how='left',
        on='patientunitstayid',
    )
)
labs_dat_lactate_missing_pid.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,third_vasp_drugstartoffset,third_vasp_drugstopoffset
0,2916248,887,lactate,14.9,mmol/L,66,2379
1,188948,742,lactate,1.2,mmol/L,2317,6669
2,183431,1120,lactate,9.2,mmol/L,150,1424
3,508124,305,lactate,2.3,mmol/L,10714,11053
4,1128129,21039,lactate,1.3,mmol/L,1074,7164


In [71]:
# Offset of each lab draw relative to the third vasopressor starting time
# (negative = drawn before the start, positive = drawn after it)
labs_dat_lactate_missing_pid['time_diff'] = labs_dat_lactate_missing_pid['labresultoffset'].sub(
    labs_dat_lactate_missing_pid['third_vasp_drugstartoffset']
)
labs_dat_lactate_missing_pid.head()

Unnamed: 0,patientunitstayid,labresultoffset,labname,labresulttext,labmeasurenamesystem,third_vasp_drugstartoffset,third_vasp_drugstopoffset,time_diff
0,2916248,887,lactate,14.9,mmol/L,66,2379,821
1,188948,742,lactate,1.2,mmol/L,2317,6669,-1575
2,183431,1120,lactate,9.2,mmol/L,150,1424,970
3,508124,305,lactate,2.3,mmol/L,10714,11053,-10409
4,1128129,21039,lactate,1.3,mmol/L,1074,7164,19965


In [72]:
## Latest lactate draw per ICU stay
## (rank 1 in descending labresultoffset; method "first" breaks ties by row order)
last_lactate_dat = (
    labs_dat_lactate_missing_pid
    .assign(rank=lambda d: d.groupby(['patientunitstayid'])['labresultoffset'].rank("first", ascending=False))
    .loc[lambda d: d['rank'] == 1, :]
    .drop(columns=['rank'])
)

## Stays whose latest draw precedes the third vasopressor start (time_diff < 0)
## are imputed with that latest lactate value
last_lactate_imputed_dat = (
    last_lactate_dat
    .loc[lambda d: d.time_diff < 0, ['patientunitstayid', 'labresulttext']]
    .rename(columns={"labresulttext": "lactate"})
)
last_lactate_imputed_dat.head()

Unnamed: 0,patientunitstayid,lactate
8,508124,1.5
10,1065411,1.8
14,1071727,3.8
17,778245,2.4
20,935315,8.1


In [73]:
## Lactate rows of the stays that were NOT imputed with their last value, extended with
## the +/-12 hour (720) window bounds and the absolute distance of each draw from the
## third vasopressor start
labs_dat_lactate_missing_pid_new = (
    labs_dat_lactate_missing_pid
    .loc[lambda d: ~d.patientunitstayid.isin(last_lactate_imputed_dat.patientunitstayid), :]
    .assign(
        lower_bound=lambda d: d['third_vasp_drugstartoffset'] - 720,
        upper_bound=lambda d: d['third_vasp_drugstartoffset'] + 720,
        abs_time_diff=lambda d: d['time_diff'].abs(),
    )
)

## Apply nearest_lab per ICU stay; stays for which it yields no value are removed afterwards
nearest_lactate_val_dat = (
    labs_dat_lactate_missing_pid_new
    .groupby(['patientunitstayid'])
    .apply(nearest_lab)
    .reset_index(name='lactate')
)
nearest_lactate_val_dat = nearest_lactate_val_dat.loc[nearest_lactate_val_dat.lactate.notnull(), :]
nearest_lactate_val_dat.head()

Unnamed: 0,patientunitstayid,lactate
3,188948,1.5
5,197253,3.0
8,467036,10.6
9,488678,2.2
12,528316,5.1


In [74]:
## Lactate rows of the stays that received neither the last nor the nearest value
labs_dat_lactate_missing_pid_last = (
    labs_dat_lactate_missing_pid_new
    .loc[lambda d: ~d.patientunitstayid.isin(nearest_lactate_val_dat.patientunitstayid), :]
    .copy()
)

In [75]:
## Note for all records, lab values are drawn after +12 hours of third vasopressor start time
(labs_dat_lactate_missing_pid_last.labresultoffset > labs_dat_lactate_missing_pid_last.upper_bound).all()

True

In [76]:
## Earliest lactate draw per ICU stay among the remaining stays
## (rank 1 in ascending labresultoffset; method "first" breaks ties by row order)
first_lactate_dat = (
    labs_dat_lactate_missing_pid_last
    .assign(rank=lambda d: d.groupby(['patientunitstayid'])['labresultoffset'].rank("first", ascending=True))
    .loc[lambda d: d['rank'] == 1, :]
    .drop(columns=['rank'])
)

In [77]:
## We take the first available lactate value after third vasopressor start time + 12 hours only if the third vasopressor stop time is not earlier than the time the lab was drawn
## Therefore, we drop records with labresultoffset > third vasopressor stop time
## NOTE: after this cell first_lactate_dat only has the columns 'patientunitstayid' and 'lactate',
## so the cell cannot be re-run without re-running the previous cell first.
first_lactate_dat = (
    first_lactate_dat
    .loc[lambda d: d.labresultoffset <= d.third_vasp_drugstopoffset, ['patientunitstayid', 'labresulttext']]
    .rename(columns={"labresulttext": "lactate"})
)
first_lactate_dat.head()

Unnamed: 0,patientunitstayid,lactate
0,2916248,14.9
2,183431,9.2
16,726416,2.1
64,1783949,10.6
67,1826965,3.2


In [78]:
## Stack the three imputation results (last / nearest / first value) and add a numeric copy
## of the lactate text; text that cannot be parsed as a number becomes NaN
imputed_lactate_dat_all = (
    pd.concat([last_lactate_imputed_dat, nearest_lactate_val_dat, first_lactate_dat], ignore_index=True)
    .assign(lactate_imputed=lambda d: pd.to_numeric(d.lactate, errors='coerce'))
)
## Report how many values failed the numeric conversion
print('Number of lactate values with symbols = ' + str(imputed_lactate_dat_all.lactate_imputed.isna().sum()))
imputed_lactate_dat_all.head()

Number of lactate values with symbols = 0


Unnamed: 0,patientunitstayid,lactate,lactate_imputed
0,508124,1.5,1.5
1,1065411,1.8,1.8
2,1071727,3.8,3.8
3,778245,2.4,2.4
4,935315,8.1,8.1


In [79]:
## Combine imputed records of lactate with lactate data extracted within +/-12 hours of 3rd vasopressor start time
## This cell appends to lactate_val_dat itself, so running it a second time would
## append the imputed rows again. lactate_val_dat is built with one row per ICU stay
## and the imputed stays are exactly those missing from it, so keeping the first row
## per patientunitstayid changes nothing on the first run and makes re-runs harmless.
lactate_val_dat = (
    pd.concat([lactate_val_dat, imputed_lactate_dat_all], ignore_index=True)
    .drop_duplicates(subset='patientunitstayid', keep='first', ignore_index=True)
)

In [80]:
## One table with the creatinine and lactate columns per ICU stay;
## the outer join keeps stays that have only one of the two labs
labs_final = pd.merge(creatinine_val_dat, lactate_val_dat, how='outer', on='patientunitstayid')

In [81]:
## Start from the third vasopressor table, attach the patient attributes (left join on the
## stay ID, which is spelled differently in the two tables), remove the redundant key and
## the original start-offset column, then attach the lab values
final_dat = (
    pd.merge(
        third_vasp_time_dat,
        patient_dat_new,
        how='left',
        left_on='patientunitstayid',
        right_on='patientUnitStayID',
    )
    .drop(columns=['patientUnitStayID', 'third_vasp_drugstartoffset_original'])
    .merge(labs_final, how='left', on='patientunitstayid')
)
final_dat.head()

Unnamed: 0,patientunitstayid,third_vasp_drugstartoffset,third_vasp_drugstopoffset,drugname,first_vasp_drugstartoffset,second_vasp_drugstartoffset,drugname_first_vasp,drugname_second_vasp,uniquepid,patientHealthSystemStayID,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,hospitalAdmitOffset,unitVisitNumber,age_int,ethnicity_cat,creatinine,creatinine_imputed,lactate,lactate_imputed
0,144173,107,390,vasopressin,47,92,norepinephrine,phenylephrine,002-2200,131256,Female,40,Asian,Med-Surg ICU,Home,Alive,-407,1,40,0,1.4,1.4,11.6,11.6
1,145394,2982,10012,phenylephrine,222,2982,norepinephrine,vasopressin,002-33362,132182,Male,61,Caucasian,Med-Surg ICU,Other Hospital,Alive,-19,1,61,1,3.66,3.66,2.2,2.2
2,146133,1663,2039,dopamine,43,268,norepinephrine,vasopressin,002-67165,132764,Male,28,Caucasian,Med-Surg ICU,Death,Expired,-1,1,28,1,3.09,3.09,8.2,8.2
3,146349,677,3286,phenylephrine,497,497,vasopressin,norepinephrine,002-26080,132933,Male,56,Native American,Med-Surg ICU,Skilled Nursing Facility,Alive,0,1,56,0,4.14,4.14,0.9,0.9
4,147985,369,3009,norepinephrine,174,279,vasopressin,epinephrine,002-50987,134197,Female,78,African American,Med-Surg ICU,Death,Expired,-1,1,78,0,2.1,2.1,12.9,12.9


In [88]:
## Keep only the records that have a numeric creatinine value
final_dat = final_dat.dropna(subset=['creatinine_imputed'])

In [92]:
## Note that there are multiple ICU stays within the same hospitalization in our cohort:
## list the (patient, hospitalization) pairs that occur more than once
final_dat.groupby(by=['uniquepid','patientHealthSystemStayID']).size().loc[lambda counts: counts > 1]

uniquepid   patientHealthSystemStayID
006-240684  508308                       2
006-249436  663463                       2
006-49071   602283                       2
006-53063   609211                       2
010-17465   836444                       2
013-14553   1012485                      3
013-15171   1032699                      2
013-38068   1003589                      2
033-10587   2645975                      2
dtype: int64

In [114]:
## Keep one ICU stay per (patient, hospitalization): the row with the largest
## hospitalAdmitOffset (rank 1 in descending order; method "first" breaks ties by row order).
## NOTE(review): this selects the FIRST ICU stay only if hospitalAdmitOffset is the hospital
## admission time relative to unit admission (closest to 0 for the earliest stay) — confirm.
first_ICU_stay_final_dat = (
    final_dat
    .assign(rank=lambda d: d.groupby(['uniquepid','patientHealthSystemStayID'])['hospitalAdmitOffset'].rank("first", ascending=False))
    .loc[lambda d: d['rank'] == 1, :]
    .drop(columns=['rank', 'unitVisitNumber'])
)

Unnamed: 0,patientunitstayid,third_vasp_drugstartoffset,third_vasp_drugstopoffset,drugname,first_vasp_drugstartoffset,second_vasp_drugstartoffset,drugname_first_vasp,drugname_second_vasp,uniquepid,patientHealthSystemStayID,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,hospitalAdmitOffset,age_int,ethnicity_cat,creatinine,creatinine_imputed,lactate,lactate_imputed
0,144173,107,390,vasopressin,47,92,norepinephrine,phenylephrine,002-2200,131256,Female,40,Asian,Med-Surg ICU,Home,Alive,-407,40,0,1.4,1.4,11.6,11.6
1,145394,2982,10012,phenylephrine,222,2982,norepinephrine,vasopressin,002-33362,132182,Male,61,Caucasian,Med-Surg ICU,Other Hospital,Alive,-19,61,1,3.66,3.66,2.2,2.2
2,146133,1663,2039,dopamine,43,268,norepinephrine,vasopressin,002-67165,132764,Male,28,Caucasian,Med-Surg ICU,Death,Expired,-1,28,1,3.09,3.09,8.2,8.2
3,146349,677,3286,phenylephrine,497,497,vasopressin,norepinephrine,002-26080,132933,Male,56,Native American,Med-Surg ICU,Skilled Nursing Facility,Alive,0,56,0,4.14,4.14,0.9,0.9
4,147985,369,3009,norepinephrine,174,279,vasopressin,epinephrine,002-50987,134197,Female,78,African American,Med-Surg ICU,Death,Expired,-1,78,0,2.1,2.1,12.9,12.9


In [112]:
## Note that we still have multiple hospital admissions of the same patient:
## count the patients with more than one distinct hospitalization ID
first_ICU_stay_final_dat.groupby('uniquepid')[['patientHealthSystemStayID']].nunique().gt(1).sum()

patientHealthSystemStayID    20
dtype: int64

In [118]:
## Records with multiple hospital visits of the same patient
## A patient is kept when more than one distinct patientHealthSystemStayID is present.
## (The previous condition tested whether ANY column had more than one distinct value,
## which is true for every patient with more than one row, whatever the reason.)
## The variable name keeps its original spelling in case later cells refer to it.
muliple_hospital_visit_dat = (
    first_ICU_stay_final_dat
    .groupby('uniquepid')
    .filter(lambda g: g['patientHealthSystemStayID'].nunique() > 1)
    .sort_values(['uniquepid'])
)
muliple_hospital_visit_dat.head()

Unnamed: 0,patientunitstayid,third_vasp_drugstartoffset,third_vasp_drugstopoffset,drugname,first_vasp_drugstartoffset,second_vasp_drugstartoffset,drugname_first_vasp,drugname_second_vasp,uniquepid,patientHealthSystemStayID,gender,age,ethnicity,unitType,hospitalDischargeLocation,hospitalDischargeStatus,hospitalAdmitOffset,age_int,ethnicity_cat,creatinine,creatinine_imputed,lactate,lactate_imputed
1,145394,2982,10012,phenylephrine,222,2982,norepinephrine,vasopressin,002-33362,132182,Male,61,Caucasian,Med-Surg ICU,Other Hospital,Alive,-19,61,1,3.66,3.66,2.2,2.2
60,199396,17,2642,vasopressin,17,17,norepinephrine,phenylephrine,002-33362,174133,Male,61,Caucasian,Med-Surg ICU,Skilled Nursing Facility,Alive,-26,61,1,5.2,5.2,1.8,1.8
30,169388,284,7060,dopamine,280,281,epinephrine,phenylephrine,002-70654,150725,Male,39,Caucasian,MICU,Home,Alive,-3,39,1,1.28,1.28,,
44,186505,91,6844,phenylephrine,86,87,epinephrine,dopamine,002-70654,164055,Male,39,Caucasian,MICU,Home,Alive,-13,39,1,1.2,1.2,,
154,450583,856,2327,dopamine,151,751,norepinephrine,vasopressin,005-13525,383190,Female,88,Hispanic,Med-Surg ICU,Death,Expired,-55,88,0,1.72,1.72,1.3,1.3


In [None]:
## TODO: In eICU there is no way to determine which hospital admission of the same patient came first.
## Therefore, we still need a rule that selects only one record for each unique patient.
## Open question: pick one at random, or pick the one with the worst outcome (unit discharge status)?