In [1]:
import pandas as pd
import time
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import missingno as msno

# 데이터 정제 시 필요한 모듈들
import src.subjectlist_alignment.pairing as pairing   # 삽관 발관 페어링 관련 모듈
import src.utils.utils as util   # 기타 유틸리티 모듈


### 1. 데이터 준비

In [48]:
# Load the input tables produced by earlier steps (read from ./outputs).
adults_icu = pd.read_csv('./outputs/adults_icu.csv')
intubation_extubation = pd.read_csv('./outputs/intubation_extubation.csv')

# Convert the time columns.
# NOTE(review): util.to_datetime is a project helper; presumably it parses the
# listed columns to datetime dtype and returns the frame — confirm in src.utils.utils.
time_cols = ['intubationtime', 'extubationtime', 'admittime', 'dischtime', 'deathtime']
intubation_extubation = util.to_datetime(intubation_extubation, time_cols)

# Report (rows, columns) of both frames as a quick sanity check.
print(f'adults_icu: {adults_icu.shape}')
print(f'intubation_extubation: {intubation_extubation.shape}')


adults_icu: (73181, 13)
intubation_extubation: (10992, 14)


In [49]:
# Display the loaded intubation/extubation frame for visual inspection.
intubation_extubation

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime
0,10001884,26184834,37510196,2131-01-07 20:39:00,2131-01-11 04:30:00,224385,65.0,37510196.0,2131-01-12 17:40:00,227194.0,65.0,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00
1,10003400,23559586,38383343,2137-08-04 00:07:00,2137-08-17 21:21:00,224385,90.5,38383343.0,2137-08-21 15:40:00,227194.0,99.6,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,NaT,,,,2144-04-13 17:31:00,NaT
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,NaT
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10987,19997367,20617667,35616526,2126-04-20 07:15:00,2126-05-05 18:20:00,224385,59.0,35616526.0,2126-05-02 16:30:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,NaT
10988,19997367,20617667,35616526,2126-04-20 07:15:00,2126-05-05 18:20:00,224385,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,NaT
10989,19999068,21606769,30143796,2161-08-24 04:10:00,2161-08-25 15:34:00,224385,55.8,30143796.0,2161-08-28 13:35:00,227194.0,55.8,Planned Extubation,2161-09-02 19:00:00,NaT
10990,19999442,26785317,32336619,2148-11-19 10:00:00,2148-11-19 19:00:00,224385,107.5,32336619.0,2148-11-20 14:15:00,227194.0,107.5,Planned Extubation,2148-12-04 16:25:00,NaT


### 2. intubation, extubation 데이터 페어링

In [50]:
def pair_data(df):
    """Pair intubation and extubation events within each admission.

    Rows are grouped by ``(subject_id, hadm_id)`` and each group is handled
    according to how many non-null ``intubationtime`` values it holds:

    * exactly one  -> the group's rows are passed through unchanged;
    * more than one -> the unique non-null intubation and extubation times are
      handed to ``pairing.main_pairing`` and the result is rebuilt into a frame
      by ``pairing.reformat_paired_data_to_dataframe``;
    * none         -> the group is not carried into the result.
      NOTE(review): this silently drops such groups (e.g. extubation-only
      admissions, if any exist) — confirm that this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject_id``, ``hadm_id``,
        ``intubationtime`` and ``extubationtime``.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all retained groups, sorted ascending by
        ``subject_id``, ``hadm_id``, ``intubationtime``, ``extubationtime``
        with a fresh 0..n-1 index. If no group is retained (for example when
        ``df`` is empty), an empty frame with the columns of ``df`` is
        returned instead of raising.
    """
    # Group by admission: one group per (subject_id, hadm_id).
    grouped_df = df.groupby(['subject_id', 'hadm_id'])

    single_row_list = []
    reformatted_paired_dataframe_list = []

    # First pass: bypass single-intubation groups, pair and reformat the rest.
    for (subject_id, hadm_id), group in tqdm(grouped_df, desc="\tProcessing Groups"):
        # Series.count() ignores NaT, so this is the number of rows that
        # actually carry an intubation time. Computed once per group.
        intubation_count = group['intubationtime'].count()

        if intubation_count == 1:
            # Only one intubation event: no pairing needed, keep rows as they are.
            single_row_list.append(group)

        elif intubation_count > 1:
            # Several intubation events: pair them against the extubation events
            # using the distinct, non-null timestamps found in this group.
            unique_intubations = list(group['intubationtime'].dropna().unique())
            unique_extubations = list(group['extubationtime'].dropna().unique())

            pairs = pairing.main_pairing(unique_intubations, unique_extubations)

            # Rebuild a frame for this admission from the pairing result.
            reformatted_paired_dataframe = pairing.reformat_paired_data_to_dataframe(
                group, pairs, subject_id, hadm_id
            )
            reformatted_paired_dataframe_list.append(reformatted_paired_dataframe)

    combined_list = single_row_list + reformatted_paired_dataframe_list

    # pd.concat raises ValueError("No objects to concatenate") on an empty
    # list; return an empty frame with the input's columns instead.
    if not combined_list:
        return df.iloc[0:0].reset_index(drop=True)

    combined_df = pd.concat(combined_list, ignore_index=True)

    # Sort ascending by subject_id, hadm_id, intubationtime, extubationtime.
    combined_df = combined_df.sort_values(
        by=['subject_id', 'hadm_id', 'intubationtime', 'extubationtime'],
        ascending=True,
    )
    combined_df = combined_df.reset_index(drop=True)

    return combined_df

In [51]:
# Run the pairing over every (subject_id, hadm_id) group of the loaded frame.
paired_df = pair_data(intubation_extubation)


	Processing Groups:   0%|          | 0/7087 [00:00<?, ?it/s]

	Processing Groups: 100%|██████████| 7087/7087 [00:03<00:00, 1830.31it/s]
  combined_df = pd.concat(combined_list, ignore_index=True)
  combined_df = pd.concat(combined_list, ignore_index=True)


In [52]:
# Display the paired result for visual inspection.
paired_df

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime,marker
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,65.0,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,99.6,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,NaT,,,,2144-04-13 17:31:00,NaT,
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,NaT,
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,NaT,,,,2144-02-06 11:45:00,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9391,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-05-05 18:20:00,224385.0,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,NaT,
9392,19997367,20617667,,2126-04-20 07:15:00,NaT,,,35616526.0,2126-04-22 08:23:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,NaT,
9393,19999068,21606769,30143796.0,2161-08-24 04:10:00,2161-08-25 15:34:00,224385.0,55.8,30143796.0,2161-08-28 13:35:00,227194.0,55.8,Planned Extubation,2161-09-02 19:00:00,NaT,
9394,19999442,26785317,32336619.0,2148-11-19 10:00:00,2148-11-19 19:00:00,224385.0,107.5,32336619.0,2148-11-20 14:15:00,227194.0,107.5,Planned Extubation,2148-12-04 16:25:00,NaT,


In [54]:
# Spot-check the paired rows of a single subject (subject_id 11570331).
paired_df[paired_df.subject_id == 11570331]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime,marker
1473,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 05:59:00,224385.0,92.4,30916265.0,2134-12-28 18:36:00,225477.0,92.4,Unplanned Extubation (non-patient initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,
1474,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 09:00:00,224385.0,92.4,30916265.0,2134-12-29 14:57:00,227194.0,92.4,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,
1475,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 15:02:00,224385.0,92.4,,NaT,,,,2135-01-06 10:38:00,2135-01-06 10:38:00,
1476,11570331,26919892,30916265.0,2134-12-28 04:51:00,2135-01-03 17:15:00,224385.0,92.4,30916265.0,2135-01-04 08:54:00,225468.0,92.4,Unplanned Extubation (patient-initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,


In [53]:
# Persist the paired result. to_csv defaults to index=True, so the row index is
# written as an unnamed first column.
# NOTE(review): confirm downstream readers expect that column; otherwise pass index=False.
paired_df.to_csv('./outputs/paired_df_test240126.csv')