### ML Data pre-processing
This notebook loads and cleans the data that will be used to train the ML model.
This includes measurements such as patient heart rate and blood pressure readings taken around the time the second dose was administered.

It should persist the data into the "out" directory, to be consumed by the ML training notebook.

In [None]:
import pandas as pd
import root_config as rc

# Must run before the detectdd imports below.
# NOTE(review): presumably this sets up the import path / project root so that
# `detectdd` resolves — confirm against root_config.
rc.configure()

from detectdd.auth_bigquery import BigQueryClient
from detectdd.serializer import Serializer

# Load both cohort files written by 01-cohort.ipynb (with and without ICD codes),
# printing the row count of each as it is read.
try:
    serializer = Serializer()
    cohort_with_icd = serializer.read_cohort()
    print(len(cohort_with_icd))
    cohort_without_icd = serializer.read_cohort_with_no_icd()
    print(len(cohort_without_icd))
except FileNotFoundError as err:
    # Chain explicitly so the underlying FileNotFoundError stays attached as the direct cause.
    raise Exception(
        "Need to run [01-cohort.ipynb] at least once to create the cohort file in the /out directory"
    ) from err

# Stack the two cohorts into one frame: with-ICD rows first, then without-ICD rows.
# The original row indices are kept unchanged (no ignore_index).
cohort = pd.concat([cohort_with_icd, cohort_without_icd])

big_query = BigQueryClient.auth()
cohort.describe()

In [None]:
from detectdd.query_multiplexer import QueryMultiplexer, WhereClauseGenerator
from detectdd import config

# BigQuery dataset name that is interpolated in front of `.chartevents` in the query below.
icu = 'physionet-data.mimiciv_icu'

# chartevents item ids selected as mean blood-pressure measurements, keyed by a readable label.
# Insertion order is preserved, so the rendered SQL IN list follows this order.
cs_mean_art_chart_itemids = {
    "PA mean pressure (PA Line)": 226857,
    "Arterial Blood Pressure mean": 220052,
    "Non Invasive Blood Pressure mean": 220181,
    "ART BP Mean": 225312,
}

# Reuse the client already authenticated in the first cell (`big_query`) instead of
# authenticating a second time.
query_multiplexer = QueryMultiplexer(big_query)

def fetch_blood_pressure_data():
    """Fetch mean blood-pressure chart events around each cohort second-dose time.

    Reads the module-level ``cohort``, ``icu``, ``cs_mean_art_chart_itemids`` and
    ``query_multiplexer``. The cohort is grouped into a ``stay_id -> [dose_b_time, ...]``
    mapping, which is passed together with the SQL template and the per-pair WHERE
    fragment to ``query_multiplexer.multiplex_query``.

    Returns:
        Whatever ``multiplex_query`` returns, unchanged.

    Raises:
        ValueError: if ``cs_mean_art_chart_itemids`` is empty, because an empty
            ``IN ()`` list cannot form a valid query.
    """
    if not cs_mean_art_chart_itemids:
        raise ValueError("cs_mean_art_chart_itemids is empty; no item ids to query for")

    # Render the IN list explicitly rather than interpolating a Python tuple: a tuple's repr
    # gains a trailing comma when it holds exactly one element, e.g. "(226857,)".
    item_id_list = "(" + ", ".join(str(item_id) for item_id in cs_mean_art_chart_itemids.values()) + ")"

    # The WHERE clause admits only the ids in item_id_list, so abs_type currently always
    # evaluates to "MEAN"; the "SYSTOLIC" branch is never taken with this filter.
    # `$where` is left as a placeholder for the multiplexer to fill in.
    sql_template = f"""SELECT ce.subject_id AS subject_id,
            ce.stay_id AS stay_id,
            ce.hadm_id AS hadm_id,
            ce.itemid AS item_id,
            ce.charttime AS chart_time,
            ce.valuenum AS bp,
            IF(ce.itemid IN {item_id_list}, "MEAN", "SYSTOLIC") AS abs_type,
        FROM `{icu}.chartevents` AS ce
        
        WHERE ce.itemid IN {item_id_list}                          --measures of mean arterial BP
        AND ($where)"""

    # One fragment per (stay_id, dose_b_time) pair; presumably WhereClauseGenerator substitutes
    # the two placeholders and combines the fragments — verify against query_multiplexer.
    # NOTE(review): BigQuery's DATETIME_DIFF counts HOUR boundaries crossed, not elapsed time,
    # so "BETWEEN 0 AND 12" can admit readings from earlier in the dose's clock hour and up to
    # just under 13 hours after — confirm this window is what is intended.
    where_fragment = "(ce.stay_id= $stay_id AND DATETIME_DIFF(ce.charttime, '$dose_b_time', HOUR) BETWEEN 0 AND 12)"

    # stay_id -> list of every dose_b_time recorded for that stay in the cohort.
    multimap_data = {
        stay_id: dose_times.tolist()
        for stay_id, dose_times in cohort.groupby('stay_id')['dose_b_time']
    }

    return query_multiplexer.multiplex_query(
        sql_template,
        multi_map_data=multimap_data,
        where_clause=WhereClauseGenerator(where_fragment, "stay_id", "dose_b_time"),
    )

# Run the multiplexed blood-pressure query once and keep the result in `results`,
# which the cells below inspect; the bare expression displays it as this cell's output.
results = fetch_blood_pressure_data()
results

In [None]:
# Show which type fetch_blood_pressure_data() handed back.
type(results)

In [None]:
# Show the length of the query result.
len(results)

In [None]:
# Display the query result again (same expression that ends the fetch cell).
results