# Sepsis EHR Data Extraction

In [1]:
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tempo_ql import QueryEngine, FileVariableStore, MEDSDataset, GenericDataset, formats, TimeSeriesSet

In [62]:
dataset_name = 'ehrshot' # or 'mimiciv' or 'ehrshot'
data_path = f'../../../sepsis-reasoning-dashboard/data/{dataset_name}'
gemini_key = open('../../gemini_key.txt').read().strip()

reset_cohort = False # set to True to remove cohort definition if you've already created one using this script

# GCP project in which to run queries - make sure it has access to MIMIC-IV through physionet.org
project_id = "ai-clinician"
# name of a dataset within your project to store temporary results. Required if you plan to subset the data to run queries
scratch_dataset = "ai-clinician.tempo_ql_scratch_" + dataset_name

In [63]:
if not os.path.exists(data_path): os.mkdir(data_path)

var_store = FileVariableStore(os.path.join(data_path, '_cache'))

if dataset_name == 'eicu':
    dataset = GenericDataset(f'bigquery://{project_id}', formats.eicu(), 
                         scratch_schema_name=scratch_dataset, 
                         time_field_transform=lambda x: x * 60)
    if reset_cohort: dataset.reset_trajectory_ids()

    if not os.path.exists(data_path): os.mkdir(data_path)
    query_engine = QueryEngine(dataset, variable_stores=[var_store])
elif dataset_name == 'mimiciv':
    dataset = GenericDataset(f'bigquery://{project_id}', formats.mimiciv(), 
                         scratch_schema_name=scratch_dataset)
    if reset_cohort: dataset.reset_trajectory_ids()

    if not os.path.exists(data_path): os.mkdir(data_path)
    query_engine = QueryEngine(dataset, variable_stores=[var_store])
elif dataset_name == 'ehrshot':
    dataset = MEDSDataset(os.path.join(data_path, "data/*.parquet"), os.path.join(data_path, "metadata/*.parquet"),
                          connection_string='duckdb:///' + os.path.join(data_path, 'variables.db'))
    if reset_cohort: dataset.reset_trajectory_ids()

    query_engine = QueryEngine(dataset, variable_stores=[var_store])



# Cohort Definition

We want to select patients who have an antibiotic and a culture taken within 24 hours, or a diagnosis code with sepsis. We also want to exclude patients under 18 years old and patients who are in the ICU for at least 4 hours.

In [None]:
query_engine.interactive(file_path='test.json', api_key=gemini_key)

TempoQLWidget(api_status='Configured', file_contents={'Query1': '', 'Query2': '{Gender}'}, ids_length=56897, l…

In [None]:
# Uncomment to start interactive widget to edit cohorts
# query_engine.interactive(file_path=f'queries/cohort_{dataset_name}.json')

In [None]:
cohort_information = query_engine.query_from(f'queries/cohort_{dataset_name}.json', 
                                             variable_store=var_store,
                                             show_progress=True)

In [None]:
traj_ids = cohort_information['Cohort'].get_ids()[cohort_information['Cohort'].get_values() > 0]
print("Filtering to", len(traj_ids), "IDs")
dataset.set_trajectory_ids(traj_ids)

# Signal Extraction

In this stage we extract consolidated concepts for each of the variables we are ultimately interested in.

In [39]:
# Uncomment to start interactive widget to edit extracted data
query_engine.interactive(file_path=f'queries/extraction_{dataset_name}.json')

TempoQLWidget(api_status='Not configured - please provide a valid Gemini API key', file_contents={'Timestamps'…

In [41]:
query_engine.query_from(f'queries/extraction_{dataset_name}.json', 
                        variable_store=var_store, 
                        show_progress=True);

  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.pattern)
  filters = concept_col.astype(str).str.contains(self.p

# Modeling Features

Finally, we aggregate the features using a timestep definition: every 4 hours from either admission or sepsis onset to discharge.

In [42]:
model_features = query_engine.query_from(f'queries/model_features.json', 
                                         show_progress=True,
                                         query_transform=lambda _, query: f"({query}) every 4 h from ((SepsisOnset where #value < Discharge) impute Admission) to min(Discharge, Admission + 14 days)");

Model:CVP:  12%|█▏        | 26/224 [00:15<02:01,  1.63it/s]                                

def compiled_fn(ids, times, var_dac976a880dd2de=None):  return var_dac976a880dd2de
def compiled_fn(ids, times, var_dac976a880dd2de=None):  return var_dac976a880dd2de


  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np

In [None]:
from tempo_ql.data_types import TimeSeriesSet

# Write to file in a consistent order
feature_names = sorted(model_features.keys())
df = TimeSeriesSet.from_series([model_features[k].rename(k) for k in feature_names]).serialize()[1]
df.assign(**{df.columns[0]: df[df.columns[0]].astype(int)}).rename(columns={df.columns[0]: 'id', df.columns[1]: 'timestep'}).to_csv(os.path.join(data_path, "extracted_model_features.csv"), index=False, float_format='%.4g')

# Downstream Targets

Here we define the variables used as predictive targets.

In [64]:
query_engine.interactive(file_path='queries/predictive_targets.json')

TempoQLWidget(api_status='Not configured - please provide a valid Gemini API key', file_contents={'Vasopressor…

In [24]:
targets = query_engine.query_from("queries/predictive_targets.json", show_progress=True)
df = TimeSeriesSet.from_series([targets[k].rename(k) for k in targets]).serialize()[1]
df.assign(**{df.columns[0]: df[df.columns[0]].astype(int)}).rename(columns={df.columns[0]: 'id', df.columns[1]: 'timestep'}).to_csv(os.path.join(data_path, "predictive_targets.csv"), index=False, float_format='%.4g')

SOFA: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:59<00:00, 14.92s/it]


In [28]:
dataset.data[dataset.data['code'].str.contains('Domain')]

Unnamed: 0,subject_id,time,code,numeric_value,text_value,unit,omop_table
11326,115967129,2009-05-22 14:33:00,Domain/OMOP generated,,,,visit_occurrence
11328,115967129,2009-05-22 23:59:00,Domain/OMOP generated,,,,visit_occurrence
11350,115967129,2009-07-20 11:36:00,Domain/OMOP generated,,,,visit_occurrence
11403,115967129,2009-08-14 16:58:00,Domain/OMOP generated,,,,visit_occurrence
11426,115967129,2009-08-15 14:25:00,Domain/OMOP generated,,,,visit_occurrence
...,...,...,...,...,...,...,...
41656093,115973707,2013-10-17 23:59:00,Domain/OMOP generated,,,,visit_occurrence
41656171,115973707,2013-11-25 23:59:00,Domain/OMOP generated,,,,visit_occurrence
41656175,115973707,2013-11-26 11:51:00,Domain/OMOP generated,,,,visit_occurrence
41656178,115973707,2014-03-18 23:59:00,Domain/OMOP generated,,,,visit_occurrence


In [27]:
dataset.concepts[dataset.concepts[dataset.concept_name_field].str.contains("death", case=False).fillna(False)]

  dataset.concepts[dataset.concepts[dataset.concept_name_field].str.contains("death", case=False).fillna(False)]


Unnamed: 0,code,vocabulary,concept_code,description,scope
206,Domain/OMOP generated,Domain,OMOP generated,Death type,Domain
2138,SNOMED/87309006,SNOMED,87309006,Death of unknown cause,SNOMED
14688,SNOMED/199307003,SNOMED,199307003,Continuing pregnancy after intrauterine death ...,SNOMED
15391,SNOMED/117361000119104,SNOMED,117361000119104,Family history of sudden cardiac death,SNOMED
15957,SNOMED/134561000119109,SNOMED,134561000119109,Family disruption due to death of family member,SNOMED
20226,SNOMED/67313008,SNOMED,67313008,Fetal death due to termination of pregnancy,SNOMED
21479,SNOMED/14022007,SNOMED,14022007,"Fetal death, affecting management of mother",SNOMED
21721,SNOMED/472321009,SNOMED,472321009,Continuing pregnancy after intrauterine death ...,SNOMED
23110,SNOMED/230802007,SNOMED,230802007,Brainstem death,SNOMED
23550,CPT4/99490,CPT4,99490,Chronic care management services with the foll...,CPT4
