### This is a copy of ml-data-preprocessing, written to help with scaffolding for the query multiplexer.
### 02-ml-data-preprocessing is still the main preprocessing file

### ML Data pre-processing
This notebook loads and cleans the data that the ML model will be trained on,
for example patient heart rate and blood pressure readings that occurred around the time the second dose was administered.

It should persist the data into the "out" directory to be consumed by the ml training notebook

In [12]:
# --- Imports and project configuration --------------------------------------
import root_config as rc
from detectdd import config
import pandas as pd

rc.configure()

# NOTE(review): the detectdd sub-module imports sat after rc.configure() in the
# original cell; that order is kept in case they rely on the configuration step.
from detectdd.auth_bigquery import BigQueryClient
from detectdd.query_multiplexer import QueryMultiplexer
from detectdd.query_multiplexer import WhereClauseGenerator
from detectdd.serializer import Serializer

# --- Cohort produced by 01-cohort.ipynb --------------------------------------
print("Loading cohort")

try:
    serializer = Serializer()
    cohort_with_icd = serializer.read_cohort()  # need to run 01-cohort.ipynb to produce the cohort
except FileNotFoundError as err:
    # Chain the original error so the missing path stays visible in the traceback.
    raise FileNotFoundError(
        "Need to run [01-cohort.ipynb] at least once to create the cohort file in the /out directory"
    ) from err

big_query = BigQueryClient.auth()

# --- Comparison cohort without drug interactions -----------------------------
# dose_b_time is read from CSV as text, so it is converted to a datetime column
# while loading; building the frame in one expression keeps the cell idempotent.
cohort_with_no_ddi = (
    pd.read_csv(config.out_dir / 'non-drug-interactions.csv')
    .assign(dose_b_time=lambda frame: frame["dose_b_time"].astype("datetime64[s]"))
)

# --- Select the data set to fetch vitals for ----------------------------------
# Flip this flag to fetch vitals for the no-drug-interaction cohort instead of
# the main cohort; the output file name switches with it.
USE_NO_DDI_COHORT = False

if USE_NO_DDI_COHORT:
    data_cohort = cohort_with_no_ddi
    cohort_filename = "vitals_data_before_and_after_no_drug_interaction.csv"
else:
    data_cohort = cohort_with_icd
    cohort_filename = "vitals_data_before_and_after_ABC.csv"

# Number of distinct patients in the cohort that will be queried.
print(data_cohort['subject_id'].nunique())

Loading cohort
Loaded cohort from ..\out\cohort-full.out
<google.oauth2.credentials.Credentials object at 0x000002A37B04E740> mimic-iv-desktop
1544


In [13]:

# Fetch the vital signs charted around each cohort member's second dose
# (dose_b_time) from the MIMIC-IV derived vitalsign table.

# Half-width of the time window around dose_b_time, in minutes (720 min = 12 h
# before and 12 h after).
WINDOW_MINUTES = 720

# Unique subject_ids of the selected cohort, and a comma-separated rendering of
# them. The multiplexed query below does not use these; they are kept for
# ad-hoc `WHERE subject_id IN (...)` queries.
subject_ids = data_cohort['subject_id'].unique()
subject_id_str = ', '.join(str(subject_id) for subject_id in subject_ids)

query_multiplexer = QueryMultiplexer(big_query)

# Keep only rows with at least one vital present.
# NOTE(review): `$where` is presumably replaced by QueryMultiplexer with the
# clauses generated from `where_fragment` - confirm against query_multiplexer.
query = """
SELECT stay_id, subject_id, charttime, heart_rate, sbp, dbp, mbp
FROM `physionet-data.mimiciv_derived.vitalsign`
WHERE ($where) 
    AND (heart_rate IS NOT NULL OR sbp IS NOT NULL OR dbp IS NOT NULL OR mbp IS NOT NULL)
"""

# One fragment per (stay_id, dose_b_time) pair: readings strictly inside the
# +/- WINDOW_MINUTES window around the dose time.
where_fragment = (
    "(stay_id= $stay_id"
    f" AND charttime > DATETIME_ADD('$dose_b_time', INTERVAL -{WINDOW_MINUTES} MINUTE)"
    f" AND charttime < DATETIME_ADD('$dose_b_time', INTERVAL {WINDOW_MINUTES} MINUTE))"
)

# Map each stay_id to the list of its dose_b_time values.
multimap_data = {
    stay_id: dose_times.tolist()
    for stay_id, dose_times in data_cohort.groupby('stay_id')['dose_b_time']
}
results = query_multiplexer.multiplex_query(
    query,
    multi_map_data=multimap_data,
    where_clause=WhereClauseGenerator(where_fragment, "stay_id", "dose_b_time"),
)

Executing query 1, with 1679 pairs at 2023-11-03 11:47:38.859173
Partitioning key value pairs 1679
Number of partitions 6 with partition_size 279.8333333333333
Got result with 9007 values
Got result with 9556 values
Got result with 9552 values
Got result with 9576 values
Got result with 9315 values
Got result with 9313 values
Executing query 2, with 1172 pairs at 2023-11-03 11:49:26.221430
Single partition
Got result with 39033 values
Executing query 3, with 871 pairs at 2023-11-03 11:51:47.050386
Single partition
Got result with 28939 values
Executing query 4, with 674 pairs at 2023-11-03 11:52:45.558002
Single partition
Got result with 22923 values
Executing query 5, with 534 pairs at 2023-11-03 11:53:29.751083
Single partition
Got result with 17280 values
Executing query 6, with 442 pairs at 2023-11-03 11:54:24.752125
Single partition
Got result with 14576 values
Executing query 7, with 363 pairs at 2023-11-03 11:55:06.351172
Single partition
Got result with 11792 values
Executing q

In [14]:
# The multiplexed query already ran in the previous cell; `results` holds what
# multiplex_query returned. Alias it under a descriptive name and show summary
# statistics as a sanity check of the fetched vitals.
vitals_data = results
vitals_data.describe()

Unnamed: 0,dose_b_time,subject_id,charttime,heart_rate,sbp,dbp,mbp
count,235790,235790.0,235790,181098.0,184156.0,184113.0,184207.0
mean,2154-03-01 20:05:48.659060736,14942333.980826,2154-03-01 20:06:03.497349,90.212652,116.615189,61.411769,77.415549
min,2110-03-02 18:20:00,10004733.0,2110-03-02 07:00:00,5.0,8.0,1.0,1.0
25%,2133-10-25 16:44:00,12429062.0,2133-10-25 16:06:15,76.0,101.0,52.0,67.0
50%,2154-01-19 13:42:00,14918516.0,2154-01-19 18:01:00,89.0,114.0,60.0,75.0
75%,2175-10-03 12:42:00,17509107.0,2175-10-03 21:45:00,102.0,130.0,69.0,86.0
max,2209-05-30 02:04:00,19983257.0,2209-05-30 14:00:00,217.0,329.0,290.0,299.0
std,,2886914.165744,,19.709705,21.900488,14.349584,15.897094


In [15]:
# Persist the fetched vitals into the out directory under the file name chosen
# together with the cohort.
vitals_output_path = config.out_dir / cohort_filename
vitals_data.to_csv(vitals_output_path)