In [2]:
import root_config as rc

rc.configure()

from detectdd.auth_bigquery import BigQueryClient
from detectdd.serializer import Serializer

try:
    cohort = Serializer().read_cohort()  # need to run 01-cohort.ipynb to produce the cohort
except FileNotFoundError:
    raise Exception("Need to run [01-cohort.ipynb] at least once to create the cohort file in the /out directory")


big_query = BigQueryClient.auth()

cohort.describe()

Loaded cohort from ..\out\cohort-full.out
<google.oauth2.credentials.Credentials object at 0x0000015CD94F7C70> assignment-1-395912


Unnamed: 0,subject_id,stay_id,drug_a_item_id,drug_b_item_id,dose_b_time,event_count
count,65814.0,65814.0,65814.0,65814.0,65814,65814.0
mean,14861841.983545,34935152.596742,225337.914061,223402.287659,2154-08-20 12:07:36.738081,7.232534
min,10002428.0,30000484.0,221347.0,221289.0,2110-01-19 21:15:00,1.0
25%,12350214.75,32259031.0,225855.0,221744.0,2134-03-24 07:48:00,1.0
50%,14767018.0,34928659.0,225869.0,221744.0,2155-03-14 18:23:00,3.0
75%,17362900.0,37550329.0,225892.0,225798.0,2174-09-03 21:18:00,8.0
max,19999840.0,39999230.0,229618.0,229233.0,2209-05-31 15:15:00,304.0
std,2903605.512713,2950984.617329,1422.498681,2057.698826,,16.848738


In [16]:
import pandas as pd
from detectdd.auth_bigquery import BigQueryClient

serializer = Serializer()
subject_ids = cohort['subject_id'].unique()

subject_id_str = ', '.join([str(id) for id in subject_ids])
print(subject_id_str)
big_query = BigQueryClient.auth()

query = f"""
SELECT subject_id, heart_rate, sbp, dbp, mbp, charttime
FROM `physionet-data.mimic_derived.vitalsign`
WHERE subject_id IN ({subject_id_str})
"""

vitals_data = big_query.query(query).to_dataframe()

vitals_data.to_csv('vitals_data_final.csv', index=False)


12750983, 15443439, 18638184, 11123887, 14808763, 14836177, 17694075, 15850072, 16345504, 13637699, 14257483, 19860377, 15239071, 13332600, 10245082, 14668032, 10314359, 13078497, 11069887, 12476775, 10828063, 14582949, 11230452, 12163850, 10017531, 12749568, 18588165, 14192456, 11475003, 13191989, 15198228, 10699336, 18561312, 19244907, 18409161, 11266689, 12605716, 19965625, 13188070, 15032392, 13830137, 10433099, 10947245, 18398510, 16232319, 16751749, 10840816, 19016592, 14338016, 14610595, 13386388, 11570162, 17080103, 19131048, 19997367, 15162030, 14066157, 11913668, 14480814, 10388400, 11012243, 18078692, 11076206, 13383411, 19299811, 19624089, 14923562, 15071011, 16237480, 19656279, 11941849, 10760364, 12527616, 14539683, 16391076, 11004856, 18196788, 19623193, 19858494, 12384098, 17905563, 18875034, 14841168, 12862321, 16790266, 18129598, 14113317, 13325402, 18787376, 17167982, 15439081, 10663695, 17233058, 19112585, 11267564, 19578299, 10846671, 13699799, 18509567, 14951133, 

In [None]:
import pandas as pd

# Load the cohort data
serializer = Serializer()
try:
    cohort = serializer.read_cohort()
except FileNotFoundError:
    raise Exception("Run 01-cohort.ipynb to generate the cohort data.")

# Load the vitals data from the CSV
vitals_data = pd.read_csv('vitals_data_final.csv')
vitals_data['charttime'] = pd.to_datetime(vitals_data['charttime'], format='%Y-%m-%d %H:%M:%S')


# Filter rows based on cohort dosage times
filtered_vitals_data = pd.DataFrame()

for _, row in cohort.iterrows():
    subject_id = row['subject_id']
    dose_time = pd.to_datetime(row['dose_b_time'])
    start_time = dose_time - pd.Timedelta(hours=12)
    end_time = dose_time + pd.Timedelta(hours=12)

    filtered_rows = vitals_data[
        (vitals_data['subject_id'] == subject_id) &
        (vitals_data['charttime'] >= start_time) &
        (vitals_data['charttime'] <= end_time)
        ]

    filtered_vitals_data = pd.concat([filtered_vitals_data, filtered_rows])

filtered_vitals_data.reset_index(drop=True, inplace=True)
filtered_vitals_data.drop_duplicates(inplace=True)
filtered_vitals_data

Loaded cohort from ..\out\cohort-full.out


Unnamed: 0,subject_id,heart_rate,sbp,dbp,mbp,charttime
0,12750983,,,,,2176-10-27 17:00:00
1,12750983,81.0,92.0,54.0,68.0,2176-10-28 03:00:00
2,12750983,82.0,89.0,57.0,70.0,2176-10-28 02:00:00
3,12750983,88.0,95.0,64.0,76.0,2176-10-27 19:00:00
4,12750983,79.0,101.0,65.0,80.0,2176-10-28 00:00:00
...,...,...,...,...,...,...
2428295,16389191,70.0,,,,2178-06-12 22:00:00
2428296,16389191,60.0,111.0,57.0,72.0,2178-06-13 04:00:00
2428297,16389191,61.0,,,,2178-06-13 03:00:00
2428298,16389191,62.0,,,,2178-06-13 06:00:00


In [5]:
# Initializing lists to store baseline and post-admin data
baseline_heart_rates, baseline_sbps, baseline_dbps, baseline_mbps = [], [], [], []
post_heart_rates, post_sbps, post_dbps, post_mbps = [], [], [], []
subject_ids = []

for _, row in cohort.iterrows():
    subject_id = row['subject_id']
    dose_time = pd.to_datetime(row['dose_b_time'])
    start_time = dose_time - pd.Timedelta(hours=12)
    end_time = dose_time + pd.Timedelta(hours=12)

    baseline_data = filtered_vitals_data[
        (filtered_vitals_data['subject_id'] == subject_id) &
        (filtered_vitals_data['charttime'] < dose_time)
        ]

    post_data = filtered_vitals_data[
        (filtered_vitals_data['subject_id'] == subject_id) &
        (filtered_vitals_data['charttime'] > dose_time)
        ]

    # Append baseline and post-admin data for each vital
    baseline_heart_rates.append(baseline_data['heart_rate'].mean())
    baseline_sbps.append(baseline_data['sbp'].mean())
    baseline_dbps.append(baseline_data['dbp'].mean())
    baseline_mbps.append(baseline_data['mbp'].mean())

    post_heart_rates.append(post_data['heart_rate'].mean())
    post_sbps.append(post_data['sbp'].mean())
    post_dbps.append(post_data['dbp'].mean())
    post_mbps.append(post_data['mbp'].mean())

    subject_ids.append(subject_id)

# Create a dataframe from the lists
merged_data = pd.DataFrame({
    'subject_id': subject_ids,
    'baseline_heart_rate': baseline_heart_rates,
    'baseline_sbp': baseline_sbps,
    'baseline_dbp': baseline_dbps,
    'baseline_mbp': baseline_mbps,
    'post_heart_rate': post_heart_rates,
    'post_sbp': post_sbps,
    'post_dbp': post_dbps,
    'post_mbp': post_mbps
})

merged_data


Unnamed: 0,subject_id,baseline_heart_rate,baseline_sbp,baseline_dbp,baseline_mbp,post_heart_rate,post_sbp,post_dbp,post_mbp
0,12750983,80.342380,107.215000,60.046000,75.025948,85.082517,109.792466,61.369178,76.629630
1,15443439,73.941558,133.126033,53.851240,79.674847,71.700000,137.521739,53.413043,81.184783
2,15443439,74.691211,132.160271,53.722348,79.174107,69.962121,139.373134,53.947761,82.388060
3,18638184,103.453782,116.866935,53.387097,76.069106,116.777778,131.400000,55.485714,73.314286
4,11123887,94.717042,104.763240,48.711180,64.917445,98.198653,100.375415,48.265000,63.845000
...,...,...,...,...,...,...,...,...,...
65809,11106897,56.166667,119.000000,46.461538,62.423077,65.855856,120.594828,53.698276,70.577586
65810,15566276,96.312500,159.333333,74.800000,98.266667,83.696000,128.365079,60.746032,78.984064
65811,16389191,62.852941,116.810606,64.757576,80.022727,61.500000,128.500000,70.750000,87.250000
65812,16389191,61.000000,117.085366,62.658537,78.475610,65.413793,118.034483,68.551724,83.206897


In [6]:
from scipy.stats import ttest_rel
cleaned_data = merged_data.dropna(subset=['baseline_heart_rate', 'post_heart_rate', 'baseline_sbp', 'post_sbp', 'baseline_dbp', 'post_dbp', 'baseline_mbp', 'post_mbp'])

t_stat_heart_rate, p_val_heart_rate = ttest_rel(cleaned_data['baseline_heart_rate'], cleaned_data['post_heart_rate'])
t_stat_sbp, p_val_sbp = ttest_rel(cleaned_data['baseline_sbp'], cleaned_data['post_sbp'])
t_stat_dbp, p_val_dbp = ttest_rel(cleaned_data['baseline_dbp'], cleaned_data['post_dbp'])
t_stat_mbp, p_val_mbp = ttest_rel(cleaned_data['baseline_mbp'], cleaned_data['post_mbp'])

print(f"Heart Rate: t-statistic = {t_stat_heart_rate}, p-value = {p_val_heart_rate}")
print(f"SBP: t-statistic = {t_stat_sbp}, p-value = {p_val_sbp}")
print(f"DBP: t-statistic = {t_stat_dbp}, p-value = {p_val_dbp}")
print(f"MBP: t-statistic = {t_stat_mbp}, p-value = {p_val_mbp}")

Heart Rate: t-statistic = 27.73648775919738, p-value = 2.4580651936237734e-168
SBP: t-statistic = -46.316805226230386, p-value = 0.0
DBP: t-statistic = -23.957549372660136, p-value = 2.7393604222373467e-126
MBP: t-statistic = -27.10061596617327, p-value = 7.681814980905664e-161


In [12]:
import os
#os.remove('vitals_data_final.csv')

# ********************* Do not Run *********************

## ********************* Do not Run ********************************** ##

import pandas as pd
from detectdd.auth_bigquery import BigQueryClient

# Load the cohort data
serializer = Serializer()
try:
    cohort = serializer.read_cohort()  # assuming your cohort data includes dosage times
except FileNotFoundError:
    raise Exception("Run 01-cohort.ipynb to generate the cohort data.")

# Initialize BigQuery client
big_query = BigQueryClient.auth()

# Create an empty DataFrame to store the merged data
all_vitals_data = pd.DataFrame()

# Iterate over each row in the cohort to fetch vitals data for each dosage event
for index, row in cohort.iterrows():
    subject_id = row['subject_id']
    stay_id = row['stay_id']
    dose_time = row['dose_b_time']

    # SQL query to fetch vitals before and after the drug dosage, within a certain time window
    # SQL query to fetch vitals before and after the drug dosage, within a certain time window
    query = f"""
    SELECT subject_id, stay_id, charttime, heart_rate, sbp, dbp, mbp
    FROM `physionet-data.mimic_derived.vitalsign`
    WHERE subject_id = {subject_id} AND stay_id = {stay_id}
        AND TIMESTAMP(charttime) >= TIMESTAMP_SUB(TIMESTAMP '{dose_time}', INTERVAL 2 HOUR)
        AND TIMESTAMP(charttime) <= TIMESTAMP_ADD(TIMESTAMP '{dose_time}', INTERVAL 2 HOUR)
    ORDER BY charttime
    """


    # Execute the query
    vitals_data = big_query.query(query).to_dataframe()

    # Add additional info about dosage time
    vitals_data['dose_time'] = dose_time

    # Append this data to the overall DataFrame
    all_vitals_data = pd.concat([all_vitals_data, vitals_data], ignore_index=True)


all_vitals_data.to_csv('all_vitals_time_series.csv', index=False)

## ********************* Do not Run ********************************** ##

import pandas as pd
from detectdd.auth_bigquery import BigQueryClient
from detectdd.query_multiplexer import QueryMultiplexer, WhereClauseGenerator

# Load the cohort data
serializer = Serializer()
try:
    cohort = serializer.read_cohort()  # assuming your cohort data includes dosage times
except FileNotFoundError:
    raise Exception("Run 01-cohort.ipynb to generate the cohort data.")

query_multiplexer = QueryMultiplexer(BigQueryClient().auth())

sql_template = """
    SELECT subject_id, stay_id, charttime, heart_rate, sbp, dbp, mbp
    FROM `physionet-data.mimic_derived.vitalsign`
    WHERE $where
    ORDER BY charttime
"""

where_fragment = "(subject_id = $subject_id AND stay_id = $stay_id AND TIMESTAMP(charttime) >= TIMESTAMP_SUB(TIMESTAMP '$dose_time', INTERVAL 2 HOUR) AND TIMESTAMP(charttime) <= TIMESTAMP_ADD(TIMESTAMP '$dose_time', INTERVAL 2 HOUR))"

where_clause_gen = WhereClauseGenerator(where_fragment, "dose_b_time", ["subject_id", "stay_id", "dose_time"])

multimap_data = {k: v.tolist() for k, v in cohort.groupby(['subject_id', 'stay_id'])['dose_b_time']}
results = query_multiplexer.multiplex_query(sql_template, multi_map_data=multimap_data, where_clause=where_clause_gen)

# Convert results to a DataFrame
all_vitals_data = pd.DataFrame(results)

# Write to CSV
all_vitals_data.to_csv('all_vitals_time_series.csv', index=False)

## ********************* Do not Run ********************************** ##

detectdd.auth_bigquery import BigQueryClient
from detectdd.query_multiplexer import QueryMultiplexer, WhereClauseGenerator

# Load the cohort data
serializer = Serializer()
try:
    cohort = serializer.read_cohort()
except FileNotFoundError:
    raise Exception("Run 01-cohort.ipynb to generate the cohort data.")

query_multiplexer = QueryMultiplexer(BigQueryClient().auth())

# SQL template for vitals data
sql_template = f"""
SELECT subject_id, charttime, heart_rate, sbp, dbp, mbp
FROM `physionet-data.mimic_derived.vitalsign`
WHERE ($where)
ORDER BY charttime
"""

# Define the WHERE clause fragment for the multiplexer
where_fragment = """
(subject_id = $subject_id AND charttime BETWEEN DATETIME_SUB(DATETIME "$dose_b_time", INTERVAL 12 HOUR) 
AND DATETIME_ADD(DATETIME "$dose_b_time", INTERVAL 12 HOUR))
"""

# Prepare a multimap_data dictionary
multimap_data = {k: v.tolist() for k, v in cohort.groupby('subject_id')['dose_b_time']}

# Split the multimap_data into batches (e.g., of size 100)
batch_size = 100
keys = list(multimap_data.keys())
vitals_data_all = pd.DataFrame()

for i in range(0, len(keys), batch_size):
    batch_keys = keys[i:i+batch_size]
    batch_data = {k: multimap_data[k] for k in batch_keys}

    batch_results = query_multiplexer.multiplex_query(sql_template, multi_map_data=batch_data, where_clause=WhereClauseGenerator(where_fragment, "subject_id", "dose_b_time"))
    vitals_data_all = pd.concat([vitals_data_all, batch_results])

vitals_data_all