### This is a copy of ml-data-preprocessing, written to help with scaffolding for the query multiplexer.
### 02-ml-data-preprocessing is still the main preprocessing file

### ML Data pre-processing
This notebook loads and cleans the data that the ML model will be trained on:
measurements such as patient heart rate and blood pressure readings taken around the time the second dose was administered.

It should persist the data into the "out" directory to be consumed by the ML training notebook.

In [7]:
import root_config as rc
from detectdd import config
import pandas as pd

rc.configure()

from detectdd.auth_bigquery import BigQueryClient
from detectdd.serializer import Serializer

print("Loading cohort")

# Upper bound on the number of rows taken from each cohort when building the combined `cohort` frame.
MAX_ROWS_PER_COHORT = 10000

# Both cohort files are written by 01-cohort.ipynb. If either is missing, fail with a
# message that points at that notebook while keeping the original error as the cause.
try:
    serializer = Serializer()
    cohort_with_icd = serializer.read_cohort()  # need to run 01-cohort.ipynb to produce the cohort
    print(cohort_with_icd.describe())
    print(len(cohort_with_icd))
    cohort_without_icd = serializer.read_cohort_with_no_icd()
    print(len(cohort_without_icd))
    # Stack the first MAX_ROWS_PER_COHORT rows of each cohort; the original row indices are kept.
    cohort = pd.concat([
        cohort_with_icd.head(MAX_ROWS_PER_COHORT),
        cohort_without_icd.head(MAX_ROWS_PER_COHORT),
    ])
except FileNotFoundError as err:
    # FileNotFoundError is still an Exception subclass, so existing `except Exception`
    # handlers keep working; `from err` records which file was actually missing.
    raise FileNotFoundError(
        "Need to run [01-cohort.ipynb] at least once to create the cohort file in the /out directory"
    ) from err

# Client object handed to QueryMultiplexer when the vitals query is built.
big_query = BigQueryClient.auth()


from detectdd.query_multiplexer import WhereClauseGenerator
from detectdd.query_multiplexer import QueryMultiplexer
import pandas as pd
from detectdd.auth_bigquery import BigQueryClient

# Cohort of dose pairs with no drug interaction, read from the CSV in the out directory.
cohort_with_no_ddi = pd.read_csv(config.out_dir / 'non-drug-interactions.csv')

# read_csv does not parse dates by default, so convert dose_b_time to second-resolution datetimes.
cohort_with_no_ddi["dose_b_time"] = cohort_with_no_ddi["dose_b_time"].astype("datetime64[s]")

# Choose which cohort the rest of the notebook fetches vitals for, and the file the
# result is written to. Flip this flag instead of commenting lines in and out.
USE_NO_DDI_COHORT = False

if USE_NO_DDI_COHORT:
    data_cohort = cohort_with_no_ddi
    cohort_filename = "vitals_data_before_and_after_no_drug_interaction.csv"
else:
    data_cohort = cohort_with_icd
    cohort_filename = "vitals_data_before_and_after.csv"

Loading cohort
Loaded cohort from ..\out\cohort-full.out
            subject_id          hadm_id          stay_id  \
count          13051.0          13051.0          13051.0   
mean   14897172.711363  25209949.443951  34898688.696881   
min         10007795.0       20025078.0       30004144.0   
25%         12410891.0       22708248.5       32484982.0   
50%         14881763.0       25332191.0       34898310.0   
75%         17399295.0       27761005.0       37248820.0   
max         19995127.0       29996046.0       39986786.0   
std     2865019.433527     2898130.8584    2816369.37509   

                      dose_b_time  event_count  num_icd_codes  
count                       13051      13051.0        13051.0  
mean   2154-04-29 07:20:28.462187     3.130105       1.368631  
min           2110-02-10 06:30:00          0.0            1.0  
25%           2133-10-12 15:41:00          1.0            1.0  
50%           2154-01-30 06:00:00          2.0            1.0  
75%           2175

In [8]:

# Fresh Serializer instance (not used by the query built in this cell).
serializer = Serializer()

# Unique subject_ids of the selected cohort, and the same ids as a comma-separated string.
# NOTE: the multiplexed query below filters on stay_id and charttime, so neither of these
# values feeds into it.
subject_ids = data_cohort['subject_id'].unique()
subject_id_str = ', '.join(str(subject_id) for subject_id in subject_ids)

query_multiplexer = QueryMultiplexer(big_query)

# Half-width, in minutes, of the charttime window kept on each side of dose_b_time (720 min = 12 h).
VITALS_WINDOW_MINUTES = 720

# Vital-sign rows that carry at least one non-null measurement.
# `$where` is a placeholder; presumably the multiplexer replaces it with the predicates
# generated from `where_fragment` - confirm against detectdd.query_multiplexer.
query = """
SELECT stay_id, subject_id, charttime, heart_rate, sbp, dbp, mbp
FROM `physionet-data.mimiciv_derived.vitalsign`
WHERE ($where) 
    AND (heart_rate IS NOT NULL OR sbp IS NOT NULL OR dbp IS NOT NULL OR mbp IS NOT NULL)
"""

# Predicate for a single (stay_id, dose_b_time) pair: rows of that stay whose charttime lies
# strictly within VITALS_WINDOW_MINUTES before or after the dose. `$stay_id` and `$dose_b_time`
# are left as literal placeholders (f-strings do not interpret `$`).
where_fragment = (
    "(stay_id= $stay_id"
    f" AND charttime > DATETIME_ADD('$dose_b_time', INTERVAL -{VITALS_WINDOW_MINUTES} MINUTE)"
    f" AND charttime < DATETIME_ADD('$dose_b_time', INTERVAL {VITALS_WINDOW_MINUTES} MINUTE))"
)

# Map each stay_id to the list of dose_b_time values recorded for it in the cohort.
multimap_data = {k: v.tolist() for k, v in data_cohort.groupby('stay_id')['dose_b_time']}
results = query_multiplexer.multiplex_query(query, multi_map_data=multimap_data,
                                            where_clause=WhereClauseGenerator(where_fragment, "stay_id", "dose_b_time"))

Executing query 1, with 5032 pairs at 2023-10-31 14:37:16.133567
Partitioning key value pairs 5032
Number of partitions 6 with partition_size 838.6666666666666
Got result with 26942 values
Got result with 26868 values
Got result with 26415 values
Got result with 26378 values
Got result with 26163 values
Got result with 26874 values
Executing query 2, with 3956 pairs at 2023-10-31 14:46:44.671047
Partitioning key value pairs 3956
Number of partitions 6 with partition_size 659.3333333333334
Got result with 21280 values
Got result with 20874 values
Got result with 20634 values
Got result with 20748 values
Got result with 21058 values
Got result with 21364 values
Executing query 3, with 2932 pairs at 2023-10-31 14:53:29.828329
Partitioning key value pairs 2932
Number of partitions 6 with partition_size 488.6666666666667
Got result with 15701 values
Got result with 15625 values
Got result with 15836 values
Got result with 15496 values
Got result with 15512 values
Got result with 15921 value

In [9]:
# `results` already holds the output of the multiplexed query; no query is run here.
# Alias it as vitals_data and display per-column summary statistics.
vitals_data = results
vitals_data.describe()

Unnamed: 0,dose_b_time,subject_id,charttime,heart_rate,sbp,dbp,mbp
count,645016,645016.0,645016,484306.0,484670.0,484611.0,484548.0
mean,2154-10-22 22:26:14.314991616,14959710.879011,2154-10-22 22:27:30.666217,88.689389,115.707828,61.483698,76.540531
min,2110-02-10 06:30:00,10003400.0,2110-02-09 22:10:00,1.0,11.0,2.0,1.0
25%,2134-10-08 07:30:00,12465457.0,2134-10-08 18:45:00,75.0,100.0,51.0,66.0
50%,2154-08-31 23:13:00,14993854.0,2154-09-01 00:02:30,88.0,113.0,60.0,74.0
75%,2175-05-28 20:09:00,17444849.0,2175-05-29 00:40:00,100.0,129.0,70.0,85.0
max,2209-05-30 09:27:00,19999828.0,2209-05-30 17:02:00,229.0,328.0,279.0,299.0
std,,2861290.591336,,18.82596,21.872007,14.721063,15.996281


In [11]:
# Persist the fetched vitals to the out directory under the filename chosen with data_cohort.
# With to_csv's default index=True, the DataFrame index is written as the first, unnamed CSV column.
vitals_data.to_csv(config.out_dir / cohort_filename)

In [13]:
# Re-save the existing no-drug-interaction vitals file under the "before_and_after" file name.
# NOTE(review): this CSV round trip writes a new index column in addition to whatever columns the
# source file already holds; if the source was itself saved with an index, the copy gains an extra
# "Unnamed: 0" column - confirm the layout the training notebook expects.
pd.read_csv(config.out_dir /"vitals_data_no_drug_interaction.csv").to_csv(config.out_dir / "vitals_data_before_and_after_no_drug_interaction.csv")