# Sepsis EHR Data Extraction

In [None]:
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tempo_ql import QueryEngine, FileVariableStore, MEDSDataset, GenericDataset, formats, TimeSeriesSet
from google.cloud import bigquery

In [None]:
# Notebook configuration: choose the source dataset, prepare the local output
# directory, load the Gemini API key, and make sure a BigQuery scratch dataset
# exists for temporary query results.
from google.cloud.exceptions import NotFound

dataset_name = 'eicu' # or 'mimiciv' or 'ehrshot'
data_path = f'data/{dataset_name}'
# exist_ok=True replaces the check-then-create idiom and is safe to re-run.
os.makedirs(data_path, exist_ok=True)
# Read the key inside a context manager so the file handle is always closed.
with open('../../gemini_key.txt') as key_file: # replace with the path to your Gemini key
    gemini_key = key_file.read().strip()

reset_cohort = True # set to True to remove cohort definition if you've already created one using this script

# GCP project in which to run queries - make sure it has access to the
# selected dataset (e.g. MIMIC-IV through physionet.org)
project_id = "ai-clinician"
bq_client = bigquery.Client(project=project_id)

# name of a dataset within your project to store temporary results.
scratch_dataset = f"{project_id}.tempo_ql_scratch_" + dataset_name
try:
    bq_client.get_dataset(scratch_dataset)
except NotFound:
    # Only a missing dataset should trigger creation; authentication,
    # permission or network errors propagate instead of being masked.
    print("Creating scratch dataset")
    bq_client.create_dataset(scratch_dataset)

In [None]:
# Build the dataset connection for the selected source and wrap it in a
# QueryEngine that caches computed variables under <data_path>/_cache.

# Ensure the output directory exists once, up front (safe to re-run).
os.makedirs(data_path, exist_ok=True)

var_store = FileVariableStore(os.path.join(data_path, '_cache'))

if dataset_name == 'eicu':
    # NOTE(review): the transform multiplies the time field by 60 - presumably
    # converting eICU minute offsets to seconds; confirm against tempo_ql.
    dataset = GenericDataset(f'bigquery://{project_id}', formats.eicu(),
                             scratch_schema_name=scratch_dataset,
                             time_field_transform=lambda x: x * 60)
elif dataset_name == 'mimiciv':
    dataset = GenericDataset(f'bigquery://{project_id}', formats.mimiciv(),
                             scratch_schema_name=scratch_dataset)
elif dataset_name == 'ehrshot':
    # EHRSHOT is read from local parquet files, with a DuckDB database file
    # stored alongside the data.
    dataset = MEDSDataset(os.path.join(data_path, "data/*.parquet"), os.path.join(data_path, "metadata/*.parquet"),
                          connection_string='duckdb:///' + os.path.join(data_path, 'variables.db'))
else:
    # Fail here with a clear message instead of a NameError on `dataset` or
    # `query_engine` in a later cell.
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; expected 'eicu', 'mimiciv' or 'ehrshot'"
    )

# Drop any previously stored cohort definition before building the engine.
if reset_cohort:
    dataset.reset_trajectory_ids()

query_engine = QueryEngine(dataset, variable_stores=[var_store])

# Cohort Definition

We want to select patients who have an antibiotic and a culture taken within 24 hours, or a diagnosis code with sepsis. We also want to exclude patients under 18 years old, and keep only patients who are in the ICU for at least 4 hours.

In [None]:
# Launch the interactive query widget against a scratch query file, passing
# the Gemini key loaded in the configuration cell.
scratch_query_file = 'test.json'
query_engine.interactive(file_path=scratch_query_file, api_key=gemini_key)

In [None]:
# Uncomment to start interactive widget to edit cohorts
# query_engine.interactive(file_path=f'queries/cohort_{dataset_name}.json')

In [None]:
# Run the cohort-definition queries for the selected dataset; results are
# stored in var_store and returned keyed by query name.
cohort_query_file = f'queries/cohort_{dataset_name}.json'
cohort_information = query_engine.query_from(
    cohort_query_file,
    variable_store=var_store,
    show_progress=True,
)

In [None]:
# Restrict the dataset to the trajectories selected by the 'Cohort' query
# result, then report how many IDs remain.
cohort_selection = cohort_information['Cohort']
dataset.set_trajectory_ids_where(cohort_selection)
remaining_ids = dataset.get_ids()
print("Filtered to", len(remaining_ids), "IDs")

# Signal Extraction

In this stage we extract consolidated concepts for each of the variables we are ultimately interested in.

In [None]:
# Uncomment to start interactive widget to edit extracted data
# query_engine.interactive(file_path=f'queries/extraction_{dataset_name}.json')

In [None]:
# Run the signal-extraction queries for the selected dataset. The results are
# written to var_store; the trailing semicolon suppresses the cell output.
extraction_query_file = f'queries/extraction_{dataset_name}.json'
query_engine.query_from(
    extraction_query_file,
    variable_store=var_store,
    show_progress=True,
);

# Modeling Features

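Finally, we aggregate the features using a timestep definition: every 4 hours, starting at sepsis onset (or at admission when no sepsis onset precedes discharge) and ending at discharge or 14 days after admission, whichever comes first.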

In [None]:
def _on_timestep_grid(_name, query):
    """Wrap one feature query in the shared timestep clause.

    The first argument passed by query_from is ignored (presumably the query
    name - confirm against tempo_ql); only the query text is used.
    """
    return f"({query}) every 4 h from ((SepsisOnset where #value < Discharge) impute Admission) to min(Discharge, Admission + 14 days)"


# Evaluate every modeling feature with the timestep clause applied.
model_features = query_engine.query_from(
    'queries/model_features.json',
    show_progress=True,
    query_transform=_on_timestep_grid,
)

In [None]:
from tempo_ql.data_types import TimeSeriesSet

# Export the model features to CSV with columns in alphabetical feature order
# so the file layout is stable across runs.
feature_names = sorted(model_features.keys())
feature_series = [model_features[name].rename(name) for name in feature_names]
# serialize() returns a sequence; its second element is the tabular frame.
df = TimeSeriesSet.from_series(feature_series).serialize()[1]

# The first two columns are exported as an integer 'id' and a 'timestep'.
id_column, time_column = df.columns[0], df.columns[1]
features_out = df.assign(**{id_column: df[id_column].astype(int)})
features_out = features_out.rename(columns={id_column: 'id', time_column: 'timestep'})
features_out.to_csv(
    os.path.join(data_path, "extracted_model_features.csv"),
    index=False,
    float_format='%.4g',
)

# Downstream Targets

Here we define the variables used as predictive targets.

In [None]:
# Open the interactive widget to view or edit the predictive-target queries.
targets_query_file = 'queries/predictive_targets.json'
query_engine.interactive(file_path=targets_query_file)

In [None]:
# Evaluate the predictive-target queries and export them to CSV, keeping the
# targets in the order returned by the query engine.
targets = query_engine.query_from("queries/predictive_targets.json", show_progress=True)
target_series = [targets[name].rename(name) for name in targets]
# serialize() returns a sequence; its second element is the tabular frame.
df = TimeSeriesSet.from_series(target_series).serialize()[1]

# The first two columns are exported as an integer 'id' and a 'timestep'.
id_column, time_column = df.columns[0], df.columns[1]
targets_out = df.assign(**{id_column: df[id_column].astype(int)})
targets_out = targets_out.rename(columns={id_column: 'id', time_column: 'timestep'})
targets_out.to_csv(
    os.path.join(data_path, "predictive_targets.csv"),
    index=False,
    float_format='%.4g',
)