In [None]:
import h5py
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta
import h5py
import pickle
import matplotlib.pyplot as plt
from edm.utils.waveforms import WAVEFORMS_OF_INTERST, get_waveform
from tqdm import tqdm

pd.set_option('display.max_columns', None)

## Load Data

In [None]:
df = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/visits_2020_08_01_2022_04_30.csv")
df = df.drop(["Name", "MRN", "Birth_date"], axis=1)
print(df.shape)
df.head(3)

In [None]:
# Note that this dataset originally came from a dataset that was previously matched so 
# we can do the following column mapping.
# NOTE(review): Last_bed is populated from "First_room", exactly like First_bed. If the visits
# file also has a last-room column this may be a copy-paste slip — confirm it is intentional.
df["First_bed"] = df["First_room"]
df["Last_bed"] = df["First_room"]

# NOTE: this writes back to the same path the frame was read from, so the original CSV
# (including the Name/MRN/Birth_date columns dropped after loading) is overwritten in place.
df.to_csv("/deep/group/ed-monitor-self-supervised/v4/visits_2020_08_01_2022_04_30.csv", index=False)

## Perform Matching

**The file above was then sent through the match.py script to produce a subset containing matched beds**

```
python /deep/u/tomjin/ed-monitor-data/processing/match.py -ci /deep/group/ed-monitor-self-supervised/v4/visits_2020_08_01_2022_04_30.csv -ef /deep/group/ed-monitor/2020_08_23_2020_09_23,/deep/group/ed-monitor/2020_09_23_2020_11_30,/deep/group/ed-monitor/2020_11_30_2020_12_31,/deep/group/ed-monitor/2021_01_01_2021_01_31,/deep/group/ed-monitor/2021_02_01_2021_02_28,/deep/group/ed-monitor/2021_03_01_2021_03_31,/deep/group/ed-monitor/2021_04_01_2021_05_12,/deep/group/ed-monitor/2021_05_13_2021_05_31,/deep/group/ed-monitor/2021_06_01_2021_06_30,/deep/group/ed-monitor/2021_07_01_2021_07_31,/deep/group/ed-monitor/2021_08_01_2021_09_16,/deep/group/ed-monitor/2021_09_17_2021_10_31,/deep/group/ed-monitor/2021_11_01_2021_12_28,/deep/group/ed-monitor/2021_12_29_2022_02_28,/deep/group/ed-monitor/2022_03_01_2022_04_26 -co /deep/group/ed-monitor-self-supervised/v4/matched-cohort.csv -eo /deep/group/ed-monitor-self-supervised/v4/matched-export.csv
```

```
...
Total export file combined: (73869, 26)
After removing beds with bad times: (73328, 26)
After removing duplicate studies: (70855, 27)
Starting matching process...
101126it [08:27, 199.41it/s]
A total of 90195 cases have matching beds
After removing non-matching studies: (90195, 33)
Size: 90195 Pos: 0; Eliminated 10931 and pos 0
======
Files written to:
- /deep/group/ed-monitor-self-supervised/v4/matched-cohort.csv
- /deep/group/ed-monitor-self-supervised/v4/matched-export.csv
Done



```

In [None]:
df_cohort = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/matched-cohort.csv")
print(df_cohort.shape)
df_cohort.head(3)


In [None]:
# Parse the "Arrival_time" strings into a datetime column with one vectorized call instead of a
# row-wise apply over the whole cohort. The trailing "Z" is matched as a literal by the format
# string, so the parsed values stay timezone-naive, exactly as with datetime.strptime.
df_cohort['time'] = pd.to_datetime(df_cohort["Arrival_time"], format="%Y-%m-%dT%H:%M:%SZ")
df_cohort.head(2)

In [None]:
# Histogram of arrival times, to eyeball how the matched cohort is spread over the study period.
fig, ax = plt.subplots(1, 1, figsize=(12, 3))
ax.hist(df_cohort["time"], bins=50)
ax.set_xlabel("Arrival time")
ax.set_ylabel("Number of visits")
ax.set_title("Matched cohort arrivals over time")
# Lay out after drawing so the axis labels and title are taken into account.
fig.tight_layout()
plt.show()

## Prepare ED Numerics

This is similar to Consolidate, but only for the numerics data and not the waveforms. For this, we will just continue off of where we left off for the V3 dataset.

This is needed for David, since he is looking for the CSV files for the numeric datasets.


```
python -u /deep/u/tomjin/ed-monitor-data/processing/prepare_ed_numerics_from_matched_cohort.py -i /deep/group/ed-monitor-self-supervised/v4/matched-cohort.csv -d /deep/group/ed-monitor/2020_08_23_2020_09_23,/deep/group/ed-monitor/2020_09_23_2020_11_30,/deep/group/ed-monitor/2020_11_30_2020_12_31,/deep/group/ed-monitor/2021_01_01_2021_01_31,/deep/group/ed-monitor/2021_02_01_2021_02_28,/deep/group/ed-monitor/2021_03_01_2021_03_31,/deep/group/ed-monitor/2021_04_01_2021_05_12,/deep/group/ed-monitor/2021_05_13_2021_05_31,/deep/group/ed-monitor/2021_06_01_2021_06_30,/deep/group/ed-monitor/2021_07_01_2021_07_31,/deep/group/ed-monitor/2021_08_01_2021_09_16,/deep/group/ed-monitor/2021_09_17_2021_10_31,/deep/group/ed-monitor/2021_11_01_2021_12_28,/deep/group/ed-monitor/2021_12_29_2022_02_28,/deep/group/ed-monitor/2022_03_01_2022_04_26 -o /deep/group/physiologic-states/v3/processed -s "2021-12-27T00:00:00Z"
```

`submit_summarize_numerics.sh`

Creates the summary.csv file

```
python -u /deep/u/tomjin/ed-monitor-data/processing/summarize_ed_numerics.py -i /deep/group/physiologic-states/v3/processed
```


`submit_process_to_csv.sh`

Converts PKL to CSV files for David.

```
python -u /deep/u/tomjin/ed-monitor-data/processing/process_ed_numerics_to_csv.py -i /deep/group/physiologic-states/v3/processed -o /deep/group/physiologic-states/v3/csv
```

In [None]:
!wc -l /deep/group/physiologic-states/v3/csv/summary.csv

## Consolidate

Goes through each patient and consolidates the numeric data and waveforms based on the start/end times of each patient.

```
python -u /deep/u/tomjin/ed-monitor-data/processing/consolidate_numerics_waveforms.py -m /deep/group/ed-monitor-self-supervised/v4/matched-cohort.csv -e /deep/group/ed-monitor-self-supervised/v4/matched-export.csv -o /deep/group/ed-monitor-self-supervised/v4/patient-data -f /deep/group/ed-monitor-self-supervised/v4/consolidated.csv -c /deep/group/ed-monitor-self-supervised/v4/consolidated.csv.bak
```

The above command timed out, so we split the matched cohort into pieces and rerun the consolidation on each piece.

### Split Matched Cohort File

In [None]:
# Split file into three pieces, since it is really slow
#
df = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/matched-cohort.csv")
print(df.shape)
df.head(2)

In [None]:
df_1 = df.iloc[:30000, :]
df_2 = df.iloc[30000:60000, :]
df_3 = df.iloc[60000:, :]

In [None]:
print(df_1.shape)
print(df_2.shape)
print(df_3.shape)

In [None]:
df_1.to_csv("/deep/group/ed-monitor-self-supervised/v4/matched-cohort.1.csv", index=False)
df_2.to_csv("/deep/group/ed-monitor-self-supervised/v4/matched-cohort.2.csv", index=False)
df_3.to_csv("/deep/group/ed-monitor-self-supervised/v4/matched-cohort.3.csv", index=False)


### Run the consolidation script...


```
python -u /deep/u/tomjin/ed-monitor-data/processing/consolidate_numerics_waveforms.py -m /deep/group/ed-monitor-self-supervised/v4/matched-cohort.1.csv -e /deep/group/ed-monitor-self-supervised/v4/matched-export.csv -o /deep/group/ed-monitor-self-supervised/v4/patient-data -f /deep/group/ed-monitor-self-supervised/v4/consolidated.1.csv -c /deep/group/ed-monitor-self-supervised/v4/consolidated.csv.bak
```

```
python -u /deep/u/tomjin/ed-monitor-data/processing/consolidate_numerics_waveforms.py -m /deep/group/ed-monitor-self-supervised/v4/matched-cohort.2.csv -e /deep/group/ed-monitor-self-supervised/v4/matched-export.csv -o /deep/group/ed-monitor-self-supervised/v4/patient-data -f /deep/group/ed-monitor-self-supervised/v4/consolidated.2.csv -c /deep/group/ed-monitor-self-supervised/v4/consolidated.csv.bak
```

```
python -u /deep/u/tomjin/ed-monitor-data/processing/consolidate_numerics_waveforms.py -m /deep/group/ed-monitor-self-supervised/v4/matched-cohort.3.csv -e /deep/group/ed-monitor-self-supervised/v4/matched-export.csv -o /deep/group/ed-monitor-self-supervised/v4/patient-data -f /deep/group/ed-monitor-self-supervised/v4/consolidated.3.csv -c /deep/group/ed-monitor-self-supervised/v4/consolidated.csv.bak
```

### Rejoin The Consolidation File

In [None]:
df_1 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.1.csv")
print(df_1.shape)
df_2 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.2.csv")
print(df_2.shape)
df_3 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.3.csv")
print(df_3.shape)

In [None]:
df = pd.concat([df_1, df_2, df_3])
print(df.shape)
df.head(2)

In [None]:
df = df.drop_duplicates('patient_id')
print(df.shape)
df.head(2)

In [None]:
df.to_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.csv", index=False)

### Discover Alignment Times

In [None]:
# For every consolidated visit, derive the alignment time: the waveform start shifted by the
# recommended start trim, plus a fixed pre-alignment window of `pre_minutes_min` minutes.
pre_minutes_min = 15

output_rows = []
# CSN -> alignment datetime lookup. It must be created here: the loop writes into it, and
# without this initialisation the cell raises NameError on a fresh kernel.
csn_to_alignment_time = {}

for i, row in tqdm(df.iterrows(), total=len(df)):
    csn = row["patient_id"]
    waveform_start = row["waveform_start_time"]
    waveform_end = row["waveform_end_time"]
    # Skip visits where either waveform bound is a float rather than a timestamp string
    # (presumably NaN for a missing value after the CSV round-trip).
    if isinstance(waveform_start, float) or isinstance(waveform_end, float):
        continue
    waveform_start = datetime.strptime(waveform_start, '%Y-%m-%d %H:%M:%S%z')

    recommended_trim_start_sec = int(row["recommended_trim_start_sec"])

    start_time = waveform_start + timedelta(seconds=recommended_trim_start_sec)
    alignment_time = start_time + timedelta(seconds=pre_minutes_min * 60)
    csn_to_alignment_time[csn] = alignment_time

    # Store both the POSIX timestamp and the human-readable ISO string for each visit.
    alignment_time_timestamp = int(alignment_time.timestamp())
    output_rows.append([csn, alignment_time_timestamp, alignment_time.isoformat()])


In [None]:
df_alignment = pd.DataFrame(output_rows, columns =['csn', 'alignment_time', 'alignment_dt'])
print(df_alignment.shape)
df_alignment.head(2)

In [None]:
df_alignment.to_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.15min.alignment_times.csv", index=False)

In [None]:
# Leftover debug expression removed: it indexed a variable `b` that is not defined anywhere
# earlier in this notebook, so the cell raised NameError under Restart & Run All.

## 15 min / 60 min - (60 sec waveforms)

### Generate Dataset

In [None]:
df_consolid = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.csv")
print(df_consolid.shape)
df_consolid.head(2)

In [None]:
df_ssl = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/visits_ssl_2022_05_23.csv")
print(df_ssl.shape)
df_ssl.head(2)

valid_patient_ids = set(df_ssl["CSN"].tolist())
print(len(valid_patient_ids))

We find that many IDs were removed because not all modality data was available...

In [None]:

orig_patient_ids = set(df_consolid["patient_id"].tolist())
print(len(orig_patient_ids))
removed_patient_ids = orig_patient_ids - valid_patient_ids
print(len(removed_patient_ids))


In [None]:
df_consolid = df_consolid[df_consolid["patient_id"].isin(valid_patient_ids)]
print(df_consolid.shape)

In [None]:
df_consolid.to_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.visits_ssl_2022_05_23.csv", index=False)

In [None]:
# Divide into two pieces for faster processing

df_consolid_1 = df_consolid.iloc[:20000, :]
df_consolid_2 = df_consolid.iloc[20000:, :]

print(df_consolid_1.shape)
print(df_consolid_2.shape)

In [None]:
df_consolid_1.to_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.visits_ssl_2022_05_23.1.csv", index=False)
df_consolid_2.to_csv("/deep/group/ed-monitor-self-supervised/v4/consolidated.visits_ssl_2022_05_23.2.csv", index=False)

### Run Script
Takes roughly 24 hours

 
```
python -u /deep/u/tomjin/ed-monitor-self-supervised/preprocessing/generate_downstream_dataset.py --input-dir /deep/group/ed-monitor-self-supervised/v4/patient-data --input-file /deep/group/ed-monitor-self-supervised/v4/consolidated.visits_ssl_2022_05_23.1.csv --output-data-file /deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.1.h5 --output-summary-file /deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.1.csv --pre-minutes 15 --post-minutes 60 --waveform-length 60
```

 
```
python -u /deep/u/tomjin/ed-monitor-self-supervised/preprocessing/generate_downstream_dataset.py --input-dir /deep/group/ed-monitor-self-supervised/v4/patient-data --input-file /deep/group/ed-monitor-self-supervised/v4/consolidated.visits_ssl_2022_05_23.2.csv --output-data-file /deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.2.h5 --output-summary-file /deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.2.csv --pre-minutes 15 --post-minutes 60 --waveform-length 60
```

### Rejoin Output

In [None]:
df_downstream_1 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.1.csv")
print(df_downstream_1.shape)
df_downstream_1.head(2)

In [None]:
df_downstream_2 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.2.csv")
print(df_downstream_2.shape)
df_downstream_2.head(2)

In [None]:
df_downstream = pd.concat([df_downstream_1, df_downstream_2])
print(df_downstream.shape)

In [None]:
df_downstream.to_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.csv", index=False)

In [None]:
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.1.h5", "r") as f:
    print(f.keys())
    print(f['alignment_times'].shape)
    print(f['numerics_after'].keys())
    print(f['numerics_before'].keys())
    print(f['waveforms'].keys())
    print(f['waveforms']['II'].keys())
    print(f['waveforms']['II']['waveforms'].shape)

In [None]:
# Channel / column names stored in the downstream HDF5 files.
TYPE_II = "II"
TYPE_PLETH = "Pleth"
WAVEFORM_COLUMNS = [TYPE_II, TYPE_PLETH]
NUMERIC_COLUMNS = ['HR', 'RR', 'SpO2', 'btbRRInt_ms', 'NBPs', 'NBPd', 'Perf']

# Concatenate the two partial outputs (part 1 first, then part 2) along axis 0 into one file
# that has the same group/dataset layout as the inputs.
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.1.h5", "r") as part_1, \
        h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.2.h5", "r") as part_2, \
        h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.h5", "w") as merged:
    # Top-level alignment datasets.
    for top_level in ("alignment_times", "alignment_vals"):
        merged.create_dataset(top_level, data=np.concatenate((part_1[top_level][:], part_2[top_level][:]), axis=0))

    # Numerics: <section>/<column>/{vals,times}
    numeric_sections = {section: merged.create_group(section) for section in ("numerics_before", "numerics_after")}
    for column in NUMERIC_COLUMNS:
        for section, section_group in numeric_sections.items():
            column_group = section_group.create_group(column)
            for field in ("vals", "times"):
                column_group.create_dataset(field, data=np.concatenate((part_1[section][column][field][:], part_2[section][column][field][:]), axis=0))

    # Waveforms: waveforms/<channel>/{waveforms,qualities}
    waveform_root = merged.create_group("waveforms")
    for channel in WAVEFORM_COLUMNS:
        channel_group = waveform_root.create_group(channel)
        for field in ("waveforms", "qualities"):
            channel_group.create_dataset(field, data=np.concatenate((part_1["waveforms"][channel][field][:], part_2["waveforms"][channel][field][:]), axis=0))


In [None]:
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.h5", "r") as f:
    print(f.keys())
    print(f['alignment_times'].shape)
    print(f['numerics_after'].keys())
    print(f['numerics_before'].keys())
    print(f['numerics_after']['HR']['vals'].shape)
    print(f['numerics_before']['HR']['vals'].shape)
    print(f['waveforms'].keys())
    print(f['waveforms']['II'].keys())
    print(f['waveforms']['II']['waveforms'].shape)

### Sanity Plots

In [None]:
filename = "/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.h5"
with h5py.File(filename, "r") as f:
    print(f.get('waveforms')["II"]["waveforms"][()].shape)
    print(f.get('waveforms')["Pleth"]["waveforms"][()].shape)

In [None]:
filename = "/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.h5"
with h5py.File(filename, "r") as f:
    ecg_waveforms = f.get('waveforms')["II"]["waveforms"][()]
    pleth_waveforms = f.get('waveforms')["Pleth"]["waveforms"][()]
    plt.rcParams.update({'font.size': 8})
    a4_dims = (12, 3)
    fig, (ax1, ax2) = plt.subplots(2, figsize=a4_dims)
    ax1.plot(ecg_waveforms[1])
    ax2.plot(pleth_waveforms[1])
    plt.show()

In [None]:
filename = "/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.h5"
with h5py.File(filename, "r") as f:
    # Read only the single row being plotted rather than loading the whole dataset into memory.
    vals = f['numerics_after']["HR"]["vals"][15]

a4_dims = (12, 1.5)
fig, ax1 = plt.subplots(1, figsize=a4_dims)
xs = range(len(vals))

# NOTE(review): the trace is scaled by 1.1 before plotting, so the curve sits 10% above the stored
# HR values — confirm this is intended for a sanity plot.
ax1.plot(vals * 1.1)
# Threshold line at 110. The colour is given only via `color=`; the previous 'k-' format string
# conflicted with it and triggered a matplotlib "redundantly defined" warning.
ax1.plot(xs, np.ones(len(vals)) * 110, '-', color="orange")
ax1.fill_between(xs, np.ones(len(vals)) * 110, np.ones(len(vals)) * 150, alpha=0.4, label='Tachycardia', color="orange")
ax1.set_xticks([])
# The shaded band carries a label, so actually display it.
ax1.legend(loc="upper right")
plt.show()

In [None]:
filename = "/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.h5"
ind = 0  # row (visit) to inspect; every panel below shows this same row
with h5py.File(filename, "r") as f:
    numerics_before = f['numerics_before']
    # Read only row `ind` of each numeric instead of materialising every full dataset.
    selected = {
        name: numerics_before[name]["vals"][ind]
        for name in ("HR", "RR", "SpO2", "btbRRInt_ms", "NBPd", "NBPs", "Perf")
    }

# Mean arterial pressure estimate from the diastolic/systolic readings of the SAME row.
# (This panel previously used a hard-coded row 10 while all other panels used `ind`.)
map_vals = (2 * selected["NBPd"] + selected["NBPs"]) / 3

panels = [
    ("HR", selected["HR"]),
    ("RR", selected["RR"]),
    ("SpO2", selected["SpO2"]),
    ("btbRRInt_ms", selected["btbRRInt_ms"]),
    ("MAP", map_vals),
    ("Perf", selected["Perf"]),
]

a4_dims = (12, 6)
fig, axes = plt.subplots(len(panels), figsize=a4_dims)
for ax, (panel_name, series) in zip(axes, panels):
    ax.plot(series)
    ax.set_ylabel(panel_name)
    ax.set_xticks([])
plt.show()

## 15 min / 90 min - (60 sec waveforms)

### Rejoin Output

In [None]:
df_downstream_1 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.1.csv")
print(df_downstream_1.shape)
df_downstream_1.head(2)

In [None]:
df_downstream_2 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.2.csv")
print(df_downstream_2.shape)
df_downstream_2.head(2)

In [None]:
df_downstream = pd.concat([df_downstream_1, df_downstream_2])
print(df_downstream.shape)

In [None]:
df_downstream.to_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.csv", index=False)

In [None]:
# Channel / column names stored in the downstream HDF5 files.
TYPE_II = "II"
TYPE_PLETH = "Pleth"
WAVEFORM_COLUMNS = [TYPE_II, TYPE_PLETH]
NUMERIC_COLUMNS = ['HR', 'RR', 'SpO2', 'btbRRInt_ms', 'NBPs', 'NBPd', 'Perf']

# Concatenate the two partial outputs (part 1 first, then part 2) along axis 0 into one file
# that has the same group/dataset layout as the inputs.
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.1.h5", "r") as part_1, \
        h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.2.h5", "r") as part_2, \
        h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.h5", "w") as merged:
    # Top-level alignment datasets.
    for top_level in ("alignment_times", "alignment_vals"):
        merged.create_dataset(top_level, data=np.concatenate((part_1[top_level][:], part_2[top_level][:]), axis=0))

    # Numerics: <section>/<column>/{vals,times}
    numeric_sections = {section: merged.create_group(section) for section in ("numerics_before", "numerics_after")}
    for column in NUMERIC_COLUMNS:
        for section, section_group in numeric_sections.items():
            column_group = section_group.create_group(column)
            for field in ("vals", "times"):
                column_group.create_dataset(field, data=np.concatenate((part_1[section][column][field][:], part_2[section][column][field][:]), axis=0))

    # Waveforms: waveforms/<channel>/{waveforms,qualities}
    waveform_root = merged.create_group("waveforms")
    for channel in WAVEFORM_COLUMNS:
        channel_group = waveform_root.create_group(channel)
        for field in ("waveforms", "qualities"):
            channel_group.create_dataset(field, data=np.concatenate((part_1["waveforms"][channel][field][:], part_2["waveforms"][channel][field][:]), axis=0))


In [None]:
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.90min.60sec.h5", "r") as f:
    print(f.keys())
    print(f['alignment_times'].shape)
    print(f['numerics_after'].keys())
    print(f['numerics_before'].keys())
    print(f['numerics_after']['HR']['vals'].shape)
    print(f['numerics_before']['HR']['vals'].shape)
    print(f['waveforms'].keys())
    print(f['waveforms']['II'].keys())
    print(f['waveforms']['II']['waveforms'].shape)

## 15 min / 120 min - (60 sec waveforms)

### Rejoin Output

In [None]:
df_downstream_1 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.1.csv")
print(df_downstream_1.shape)
df_downstream_1.head(2)

In [None]:
df_downstream_2 = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.2.csv")
print(df_downstream_2.shape)
df_downstream_2.head(2)

In [None]:
df_downstream = pd.concat([df_downstream_1, df_downstream_2])
print(df_downstream.shape)

In [None]:
df_downstream.to_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.csv", index=False)

In [None]:
# Channel / column names stored in the downstream HDF5 files.
TYPE_II = "II"
TYPE_PLETH = "Pleth"
WAVEFORM_COLUMNS = [TYPE_II, TYPE_PLETH]
NUMERIC_COLUMNS = ['HR', 'RR', 'SpO2', 'btbRRInt_ms', 'NBPs', 'NBPd', 'Perf']

# Concatenate the two partial outputs (part 1 first, then part 2) along axis 0 into one file
# that has the same group/dataset layout as the inputs.
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.1.h5", "r") as part_1, \
        h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.2.h5", "r") as part_2, \
        h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.h5", "w") as merged:
    # Top-level alignment datasets.
    for top_level in ("alignment_times", "alignment_vals"):
        merged.create_dataset(top_level, data=np.concatenate((part_1[top_level][:], part_2[top_level][:]), axis=0))

    # Numerics: <section>/<column>/{vals,times}
    numeric_sections = {section: merged.create_group(section) for section in ("numerics_before", "numerics_after")}
    for column in NUMERIC_COLUMNS:
        for section, section_group in numeric_sections.items():
            column_group = section_group.create_group(column)
            for field in ("vals", "times"):
                column_group.create_dataset(field, data=np.concatenate((part_1[section][column][field][:], part_2[section][column][field][:]), axis=0))

    # Waveforms: waveforms/<channel>/{waveforms,qualities}
    waveform_root = merged.create_group("waveforms")
    for channel in WAVEFORM_COLUMNS:
        channel_group = waveform_root.create_group(channel)
        for field in ("waveforms", "qualities"):
            channel_group.create_dataset(field, data=np.concatenate((part_1["waveforms"][channel][field][:], part_2["waveforms"][channel][field][:]), axis=0))


In [None]:
with h5py.File("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.120min.60sec.h5", "r") as f:
    print(f.keys())
    print(f['alignment_times'].shape)
    print(f['numerics_after'].keys())
    print(f['numerics_before'].keys())
    print(f['numerics_after']['HR']['vals'].shape)
    print(f['numerics_before']['HR']['vals'].shape)
    print(f['waveforms'].keys())
    print(f['waveforms']['II'].keys())
    print(f['waveforms']['II']['waveforms'].shape)

## Filter Dataset According to Normal Criteria

"you should be able to get the same cohorts from visits_ssl_2020_08_01_2021_12_31.csv (in the Box folder, ask Tom if you can’t see it) as follows. These variables are all in that file, which I will update soon for the timestamps Tom provided. The basic idea is to admit into each cohort only visits for which the VS in question is normal in triage and in the 1st 15 minutes of monitoring."



In [None]:
df = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/visits_ssl_2022_05_23.csv")
print(df.shape)
df.head(3)

In [None]:

def get_numerics_averaged_by_minute(vals, times, numerics_min=15):
    """Average irregularly sampled numeric readings into fixed one-minute bins.

    Args:
        vals: Sequence of readings.
        times: Sequence of timestamps in seconds, parallel to ``vals``. Entries equal to 0
            are ignored (treated as unfilled slots — presumably padding; verify against the
            producer of these arrays), as are non-finite timestamps.
        numerics_min: Number of one-minute bins to produce, counted from the earliest
            non-ignored timestamp.

    Returns:
        A float array of length ``numerics_min``. Element ``m`` is the mean of all readings
        whose timestamp falls in ``[start + 60*m, start + 60*(m+1))``, or NaN if that minute
        has no readings. Readings beyond the last bin are dropped.
    """
    output = np.full(numerics_min, np.nan)

    vals = np.asarray(vals, dtype=float)
    times = np.asarray(times, dtype=float)

    # Ignore zero timestamps when locating the window start as well as when binning;
    # otherwise a single 0 entry would pin the start to 0 and shift every reading
    # out of its real minute.
    usable = np.isfinite(times) & (times != 0)
    if not usable.any():
        return output

    kept_vals = vals[usable]
    kept_times = times[usable]
    start = kept_times.min()

    # Bin each reading by its own timestamp, so gaps longer than one minute leave NaN bins
    # instead of sliding later readings into the next free slot.
    minute_of = ((kept_times - start) // 60).astype(int)

    for minute in range(numerics_min):
        in_minute = kept_vals[minute_of == minute]
        if in_minute.size > 0:
            output[minute] = in_minute.mean()

    return output


### Hypotension

Valid CSNs should be any whose triage MAP values >= 65 and any initial monitoring MAP values up to the alignment time >= 65

- Hypotension
    - Cohort filter: Triage_MAP>=65 & First_mon_MAP>=65
    - Case: Mon_to_first_low_MAP in 15:75/105/135 (depending on window chosen)

In [None]:
# We only want to include those who have non-decompensated triage MAP values
df_hypo = df[(df["Triage_MAP"] >= 65)]
df_hypo.shape

In [None]:
triage_csns = set(df_hypo["CSN"].tolist())
len(triage_csns)

In [None]:
df_summary = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.csv")

def f(row):
    """Return the row's POSIX ``alignment_time`` as an ISO-8601 string in America/Vancouver time.

    The timestamp is converted straight into the target zone. Building a naive datetime with
    ``fromtimestamp`` first would use the host machine's local zone and then merely relabel it
    as Pacific, which is only correct when the notebook runs on a Pacific-time host.
    """
    tz = pytz.timezone("America/Vancouver")
    return datetime.fromtimestamp(row["alignment_time"], tz).isoformat()

df_summary['alignment_dt'] = df_summary.apply(f, axis=1)
# Keep only visits whose triage value passed the cohort filter.
df_summary = df_summary[df_summary["patient_id"].isin(triage_csns)]
print(df_summary.shape)
df_summary.head(5)

In [None]:
# Among the remaining patients, find out if decompensation occurred before the alignment time
df_hypo = df[~df["Time_of_first_low_MAP"].isna()]
print(df_hypo.shape)

In [None]:
df_hypo = df_hypo[["CSN", "Time_of_first_low_MAP"]]
df_hypo.head(1)

In [None]:
def f(row):
    """Return ``Time_of_first_low_MAP`` as an ISO-8601 string with an America/Vancouver offset."""
    # NOTE(review): a trailing "Z" conventionally denotes UTC, but the parsed value is treated
    # as Pacific wall-clock time here — confirm how the visits file encodes these timestamps.
    ts = datetime.strptime(row["Time_of_first_low_MAP"], "%Y-%m-%dT%H:%M:%SZ")
    tz = pytz.timezone("America/Vancouver")
    aware_dt = tz.localize(ts)
    return aware_dt.isoformat()

# Build a new frame rather than assigning into `df_hypo`, which is a filtered slice of `df`;
# assigning a column on the slice triggers pandas' SettingWithCopyWarning.
df_hypo = df_hypo.assign(Time_of_first_low_MAP=df_hypo.apply(f, axis=1))
df_hypo.head(1)

In [None]:
csn_to_first_decomp_val = {}
for i, row in df_hypo.iterrows():
    csn_to_first_decomp_val[row["CSN"]] = row["Time_of_first_low_MAP"]

In [None]:

def f(row):
    """Return True when the visit first decompensated at or before its alignment time.

    Both timestamps are ISO-8601 strings carrying a UTC offset. They are parsed into
    timezone-aware datetimes before comparing: comparing the raw strings orders them by their
    local wall-clock text, which is wrong whenever the two offsets differ (e.g. either side of
    a daylight-saving change).
    """
    csn = row["patient_id"]
    if csn not in csn_to_first_decomp_val:
        # No decompensation occurred at all
        return False
    alignment = datetime.fromisoformat(row["alignment_dt"])
    first_decomp = datetime.fromisoformat(csn_to_first_decomp_val[csn])
    # Decompensation strictly after the alignment is fine; anything else happened before it.
    return not (alignment < first_decomp)

df_summary['decompensated_before_alignment'] = df_summary.apply(f, axis=1)
df_summary = df_summary[~df_summary['decompensated_before_alignment'].astype(bool)]
print(df_summary.shape)
df_summary.head(5)


In [None]:
df_summary.to_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.initial_map_normal.csv", index=False)

### Hypoxia

Valid CSNs should be any whose triage SpO2 values >= 90 and any initial monitoring SpO2 values up to the alignment time >= 90

- Hypoxia
    - Cohort filter: Triage_SpO2>=90 & First_mon_SpO2>=90
    - Case: Mon_to_first_low_SpO2 in 15:75/105/135

In [None]:
# We only want to include those who have non-decompensated triage SpO2 values
# (the `df_hypo` name is reused from the hypotension section above).
df_hypo = df[(df["Triage_SpO2"] >= 90)]
df_hypo.shape

In [None]:
triage_csns = set(df_hypo["CSN"].tolist())
len(triage_csns)

In [None]:
df_summary = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.csv")

def f(row):
    """Return the row's POSIX ``alignment_time`` as an ISO-8601 string in America/Vancouver time.

    The timestamp is converted straight into the target zone. Building a naive datetime with
    ``fromtimestamp`` first would use the host machine's local zone and then merely relabel it
    as Pacific, which is only correct when the notebook runs on a Pacific-time host.
    """
    tz = pytz.timezone("America/Vancouver")
    return datetime.fromtimestamp(row["alignment_time"], tz).isoformat()

df_summary['alignment_dt'] = df_summary.apply(f, axis=1)
# Keep only visits whose triage value passed the cohort filter.
df_summary = df_summary[df_summary["patient_id"].isin(triage_csns)]
print(df_summary.shape)
df_summary.head(3)

In [None]:
# Among the remaining patients, find out if decompensation occurred before the alignment time
df_hypo = df[~df["Time_of_first_low_SpO2"].isna()]
print(df_hypo.shape)

In [None]:
def f(row):
    """Return ``Time_of_first_low_SpO2`` as an ISO-8601 string with an America/Vancouver offset."""
    # NOTE(review): a trailing "Z" conventionally denotes UTC, but the parsed value is treated
    # as Pacific wall-clock time here — confirm how the visits file encodes these timestamps.
    ts = datetime.strptime(row["Time_of_first_low_SpO2"], "%Y-%m-%dT%H:%M:%SZ")
    tz = pytz.timezone("America/Vancouver")
    aware_dt = tz.localize(ts)
    return aware_dt.isoformat()

# Build a new frame rather than assigning into `df_hypo`, which is a filtered slice of `df`;
# assigning a column on the slice triggers pandas' SettingWithCopyWarning.
df_hypo = df_hypo.assign(Time_of_first_low_SpO2=df_hypo.apply(f, axis=1))
df_hypo.head(1)

In [None]:
csn_to_first_decomp_val = {}
for i, row in df_hypo.iterrows():
    csn_to_first_decomp_val[row["CSN"]] = row["Time_of_first_low_SpO2"]

In [None]:

def f(row):
    """Return True when the visit first decompensated at or before its alignment time.

    Both timestamps are ISO-8601 strings carrying a UTC offset. They are parsed into
    timezone-aware datetimes before comparing: comparing the raw strings orders them by their
    local wall-clock text, which is wrong whenever the two offsets differ (e.g. either side of
    a daylight-saving change).
    """
    csn = row["patient_id"]
    if csn not in csn_to_first_decomp_val:
        # No decompensation occurred at all
        return False
    alignment = datetime.fromisoformat(row["alignment_dt"])
    first_decomp = datetime.fromisoformat(csn_to_first_decomp_val[csn])
    # Decompensation strictly after the alignment is fine; anything else happened before it.
    return not (alignment < first_decomp)

df_summary['decompensated_before_alignment'] = df_summary.apply(f, axis=1)
df_summary = df_summary[~df_summary['decompensated_before_alignment'].astype(bool)]
print(df_summary.shape)
df_summary.head(5)


In [None]:
df_summary.to_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.initial_spo2_normal.csv", index=False)

### Tachycardia

Valid CSNs should be any whose triage HR values <= 110 and any initial monitoring HR values up to the alignment time <= 110

- Tachycardia
    - Cohort filter: Triage_HR<=110 & First_mon_HR<=110
    - Case: Mon_to_first_high_HR in 15:75/105/135

In [None]:
# We only want to include those who have non-decompensated triage HR values
# (the `df_hypo` name is reused from the hypotension section above).
df_hypo = df[(df["Triage_HR"] <= 110)]
df_hypo.shape

In [None]:
triage_csns = set(df_hypo["CSN"].tolist())
len(triage_csns)

In [None]:
df_summary = pd.read_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.csv")

def f(row):
    """Return the row's POSIX ``alignment_time`` as an ISO-8601 string in America/Vancouver time.

    The timestamp is converted straight into the target zone. Building a naive datetime with
    ``fromtimestamp`` first would use the host machine's local zone and then merely relabel it
    as Pacific, which is only correct when the notebook runs on a Pacific-time host.
    """
    tz = pytz.timezone("America/Vancouver")
    return datetime.fromtimestamp(row["alignment_time"], tz).isoformat()

df_summary['alignment_dt'] = df_summary.apply(f, axis=1)
# Keep only visits whose triage value passed the cohort filter.
df_summary = df_summary[df_summary["patient_id"].isin(triage_csns)]
print(df_summary.shape)
df_summary.head(3)

In [None]:
# Among the remaining patients, find out if decompensation occurred before the alignment time
df_hypo = df[~df["Time_of_first_high_HR"].isna()]
print(df_hypo.shape)

In [None]:
df_hypo = df_hypo[["CSN", "Time_of_first_high_HR"]]
df_hypo.head(1)

In [None]:
def f(row):
    """Return ``Time_of_first_high_HR`` as an ISO-8601 string with an America/Vancouver offset."""
    # NOTE(review): a trailing "Z" conventionally denotes UTC, but the parsed value is treated
    # as Pacific wall-clock time here — confirm how the visits file encodes these timestamps.
    ts = datetime.strptime(row["Time_of_first_high_HR"], "%Y-%m-%dT%H:%M:%SZ")
    tz = pytz.timezone("America/Vancouver")
    aware_dt = tz.localize(ts)
    return aware_dt.isoformat()

# Build a new frame rather than assigning into `df_hypo`, which is a filtered slice of `df`;
# assigning a column on the slice triggers pandas' SettingWithCopyWarning.
df_hypo = df_hypo.assign(Time_of_first_high_HR=df_hypo.apply(f, axis=1))
df_hypo.head(1)

In [None]:
csn_to_first_decomp_val = {}
for i, row in df_hypo.iterrows():
    csn_to_first_decomp_val[row["CSN"]] = row["Time_of_first_high_HR"]

In [None]:

def f(row):
    """Return True when the visit first decompensated at or before its alignment time.

    Both timestamps are ISO-8601 strings carrying a UTC offset. They are parsed into
    timezone-aware datetimes before comparing: comparing the raw strings orders them by their
    local wall-clock text, which is wrong whenever the two offsets differ (e.g. either side of
    a daylight-saving change).
    """
    csn = row["patient_id"]
    if csn not in csn_to_first_decomp_val:
        # No decompensation occurred at all
        return False
    alignment = datetime.fromisoformat(row["alignment_dt"])
    first_decomp = datetime.fromisoformat(csn_to_first_decomp_val[csn])
    # Decompensation strictly after the alignment is fine; anything else happened before it.
    return not (alignment < first_decomp)

df_summary['decompensated_before_alignment'] = df_summary.apply(f, axis=1)
df_summary = df_summary[~df_summary['decompensated_before_alignment'].astype(bool)]
print(df_summary.shape)
df_summary.head(5)


In [None]:
df_summary.to_csv("/deep/group/ed-monitor-self-supervised/v4/downstream.15min.60min.60sec.initial_hr_normal.csv", index=False)