In [5]:
import os
import numpy as np
import pandas as pd
from ehr_ml.clmbr import PatientTimelineDataset
from ehr_ml.clmbr.dataset import DataLoader
from ehr_ml.timeline import TimelineReader
from ehr_ml.clmbr import convert_patient_data
import google.auth
import torch
import torch.nn as nn

In [2]:
credentials, project_id = google.auth.default()
rs_dataset_project = 'som-nero-nigam-starr'
rs_dataset = 'jlemmon_explore'
rs_table = 'clmbr_admission_rollup'
clmbr_model_path = '/local-scratch/nigam/projects/jlemmon/cl-clmbr/experiments/main/artifacts/models/clmbr/pretrained/models/gru_sz_800_do_0.1_cd_0_dd_0_lr_0.001_l2_0.01'
cohort_fpath = '/local-scratch/nigam/projects/jlemmon/cl-clmbr/experiments/main/data/'
extract_path = os.path.join('/local-scratch/nigam/projects/jlemmon/cl-clmbr/experiments/main/data/extracts/20210723', "extract.db")
labelled_fpath = '/local-scratch/nigam/projects/jlemmon/cl-clmbr/experiments/main/data/labelled_data/hospital_mortality/pretrained/gru_sz_800_do_0.1_cd_0_dd_0_lr_0.001_l2_0.01'



In [3]:
query = f'SELECT * FROM {rs_dataset_project}.{rs_dataset}.{rs_table}'

adm_df = pd.read_gbq(query, project_id=rs_dataset_project, dialect='standard', credentials=credentials)

In [4]:
print(adm_df)

        person_id          admit_date      discharge_date
0        29923082 2018-10-16 13:30:00 2018-10-24 13:08:00
1        29923083 2018-07-22 20:22:00 2018-07-26 14:45:00
2        29923090 2018-02-04 08:05:00 2018-02-09 11:27:00
3        29923110 2015-06-27 17:03:00 2015-06-30 12:20:00
4        29923110 2020-10-27 09:06:00 2020-10-31 14:55:00
...           ...                 ...                 ...
472145   69241533 2021-06-23 18:10:00 2021-06-25 15:51:00
472146   69241534 2021-06-16 11:30:00 2021-06-19 12:23:00
472147   69241537 2021-07-02 16:57:00 2021-07-04 14:31:00
472148   69241546 2021-08-04 05:52:00 2021-08-06 13:09:00
472149   69241548 2021-08-08 12:57:00 2021-08-09 11:28:00

[472150 rows x 3 columns]


In [5]:
filter_df = adm_df[adm_df.groupby('person_id').person_id.transform('count') >1][:10]
print(filter_df)

    person_id          admit_date      discharge_date
3    29923110 2015-06-27 17:03:00 2015-06-30 12:20:00
4    29923110 2020-10-27 09:06:00 2020-10-31 14:55:00
5    29923118 2019-01-07 10:36:00 2019-01-22 17:45:00
6    29923118 2019-02-08 23:17:00 2019-02-17 15:46:00
8    29923122 2019-06-17 17:38:00 2019-06-17 23:59:00
9    29923122 2019-06-19 06:15:00 2019-06-23 12:30:00
10   29923142 2019-01-13 17:38:00 2019-01-16 16:08:00
11   29923142 2019-02-23 06:46:00 2019-02-23 08:36:00
20   29923194 2018-11-20 09:31:00 2018-11-22 17:42:00
21   29923194 2020-09-02 14:11:00 2020-09-03 17:50:00


In [None]:
val_pids = list(filter_df['person_id'].unique())
ehr_val_pids, ehr_val_days = convert_patient_data(extract_path, filter_df['person_id'], filter_df['admit_date'])
print(ehr_val_pids, ehr_val_days)
#print(val_pids)

In [None]:
ocp_ids = pd.DataFrame({'pid':list(filter_df['person_id']),'ehr_id':ehr_val_pids, 'admit_date':list(filter_df['admit_date']), 'discharge_date':list(filter_df['discharge_date']),'day_idx':ehr_val_days})
print(ocp_ids)                 

In [54]:
def load_data():
    """
    Load datasets from split csv files.
    """

    data_path = f'{labelled_fpath}'


    train_pids = pd.read_csv(f'{data_path}/ehr_ml_patient_ids_train.csv')
    # 	val_pids = pd.read_csv(f'{data_path}/ehr_ml_patient_ids_val.csv')

    train_days = pd.read_csv(f'{data_path}/day_indices_train.csv')
    # 	val_days = pd.read_csv(f'{data_path}/day_indices_val.csv')

    train_labels = pd.read_csv(f'{data_path}/labels_train.csv')
    # 	val_labels = pd.read_csv(f'{data_path}/labels_val.csv')

    train_data = (train_labels.to_numpy().flatten(),train_pids.to_numpy().flatten(),train_days.to_numpy().flatten())
    # 	val_data = (val_labels.to_numpy().flatten(),val_pids.to_numpy().flatten(),val_days.to_numpy().flatten())
    df = pd.DataFrame({'pids':train_data[1], 'days':train_data[2]})

    return df, train_pids
    
def get_windows(adm_df, pid):
    admissions = adm_df.query('person_id == @pid')
    admissions.reset_index(inplace=True)
    num_windows = int(len(admissions)/2)
    window_pairs = []
    idx = 0
    for i in range(num_windows):
        w_1 = admissions.iloc[idx]
        w_2 = admissions.iloc[idx+1]
        window_pairs.append([(w_1['person_id'],w_1['admit_date'],w_1['discharge_date']),(w_2['person_id'],w_2['admit_date'],w_2['discharge_date'])])
        print('dsaasdsa')
        print(window_pairs)
        print('dsasadas')
        
        idx+=2
    return window_pairs

def get_batch(window_pair):
    tlr = TimelineReader(extract_path)
    
    batch = []
    for wp in window_pair:
        print(wp[0])
        print(wp[1])
        p1 = tlr.get_patient(wp[0][0],wp[0][1],wp[0][2])
        p2 = tlr.get_patient(wp[1][0],wp[1][1],wp[1][2])
        rint = np.random.randint(1,2)

        print(wp[0][2]-wp[0][1])
        
#         if rint == 1:
#             days = list(p1.days) + list(p2.days)
#         else:
#             days = list(p2.days) + list(p1.days)
#         print(days)
    
    return batch
        
        
        
        

In [None]:
pid_df, t_pids = load_data()
print(pid_df)
# print(val_pids)
for pid in val_pids:
    print(pid)
    if pid in t_pids:
        print(pid)
pid_df = pid_df[pid_df['pids'].isin(val_pids)]
print(pid_df)
pid = pid_df['pids'][0]
day_idx = pid_df['days'][0]
train_data = (np.array(pid), np.array([0]), np.array(day_idx))
print(train_data)
train_dataset = PatientTimelineDataset(extract_path + '/extract.db', 
                                         extract_path + '/ontology.db', 
                                         f'{clmbr_model_path}/info.json', 
                                         train_data, 
                                         train_data )
# pid = np.random.choice(pids,size=1)[0]
# wp = get_windows(filter_df,pid)
# batch = get_batch(wp)
# with Dataloader(batch, model.config['num_first'], is_val=False, batch_size=9999999, device=args.device) as train_loader:

In [21]:
pairs = [torch.tensor([[1,1,1,1,1,1],[2,2,2,2,2,2],[3,3,3,3,3,3]]),torch.tensor([[1,1,1,1,1,1]])]
pairs = nn.utils.rnn.pad_sequence(pairs, batch_first=True)
print(pairs)
print(pairs.shape)
packed_sequence = nn.utils.rnn.pack_sequence(pairs, enforce_sorted=False)
print(packed_sequence)
print(packed_sequence[0].shape)

tensor([[[1, 1, 1, 1, 1, 1],
         [2, 2, 2, 2, 2, 2],
         [3, 3, 3, 3, 3, 3]],

        [[1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0]]])
torch.Size([2, 3, 6])


TypeError: pad_sequence(): argument 'sequences' (position 1) must be tuple of Tensors, not Tensor