# 목표
1. 중복 이벤트 삭제 함수 만들기
2. Replacement 함수 만들기
3. Reintubation 함수 만들기

## 1. 삽관 및 발관 중복 이벤트 삭제하기

### Workflow 1-1
* 기존에 있는 intubation1, extubation1 dataframe으로 함수 작성

* 중복 제거 후, time_diff를 구하기 위해 dtype 변경

* loc를 원활히 활용하기 위해 index 초기화 진행

* 첫 번째 행 (0번째 행)은 겹칠 일이 없기에 not_duplicated로 지정

* for문을 활용하여 index를 순차적 조회
    * prev_row, current_row의 subject_id, hadm_id가 같다면 같은 환자 및 시기로 판단
    * time_difference = int_time or ext_time의 current_row - prev_row
    * time_difference <= 1 이라면, duplicate로 지정
        * 추후 확인을 위해 time_difference <= 1 인 row와 prev_row 두개 duplicate로 지정

In [1]:
import time
import pandas as pd
intubation1 = pd.read_csv('./Data/intubation1.csv')
extubation1 = pd.read_csv('./Data/extubation1.csv')

In [4]:
def time_duplicate_event(Original_dataframe, time_col):
    
    # DataFrame 중복 제거
    origin_df = Original_dataframe.drop_duplicates()

    # time delta를 구하기 위해 dtype 변경
    origin_df.loc[:,time_col] = pd.to_datetime(origin_df[time_col])

    # subject_id, hadm_id, int_time(or ext_time) 순으로 정렬
    sorted_df = origin_df.sort_values(by=["subject_id", "hadm_id", time_col], ascending=True)

    # index 초기화 (loc 함수를 원활히 사용하기 위함)
    sorted_df = sorted_df.reset_index(drop=True)
    
    length_df = len(sorted_df)

    # 첫 번째 행 (0번째 행)은 겹칠일이 없기에 not_duplicated 지정
    sorted_df.loc[0, "dup"] = "not_duplicated"

    # dataframe의 길이를 가지고 for문 시작.
    for idx in range(1, (length_df)):
        
        # 현재 행과 이전 행의 subject_id가 같고 hadm_id가 같다는 조건을 동시에 만족한다면
        # 이는 같은 환자로 인정
        if (sorted_df.loc[idx, "subject_id"] == sorted_df.loc[idx-1, "subject_id"]) & \
        (sorted_df.loc[idx, "hadm_id"] == sorted_df.loc[idx-1, "hadm_id"]):
            time_diff = sorted_df.loc[idx, time_col] - sorted_df.loc[idx-1, time_col]

            # time_diff가 1시간 이내라면 duplicate, 아니라면 not_duplicate
            if time_diff <= pd.Timedelta(hours=1):
                sorted_df.loc[idx, "dup"] = "duplicated"
                sorted_df.loc[idx-1, "dup"] = "duplicated"
            else:
                sorted_df.loc[idx, "dup"] = "not_duplicated"
                
        # 현재 행과 이전 행의 subject_id, hadm_id가 다르면 not_duplicate
        else:
            sorted_df.loc[idx, "dup"] = "not_duplicated"

    return sorted_df

In [5]:
intubation1_exp = intubation1.copy()
extubation1_exp = extubation1.copy()

In [6]:
start_time = time.time()

filtered_intubation = time_duplicate_event(intubation1_exp, "intubationtime")
filtered_extubation = time_duplicate_event(extubation1_exp, "extubationtime")

print(f"intubation_shape = {filtered_intubation[filtered_intubation['dup'] == 'duplicated'].shape}")
print(f"extubation_shape = {filtered_extubation[filtered_extubation['dup'] == 'duplicated'].shape}")

end_time = time.time()
wait_time = end_time - start_time

print("----")
print(f"경과 시간 : {round(wait_time, 2)} sec")


intubation_shape = (18, 7)
extubation_shape = (36, 8)
----
경과 시간 : 2.27 sec


In [13]:
filtered_intubation[filtered_intubation["dup"] == "duplicated"]

Unnamed: 0,subject_id,hadm_id,stay_id,intubationtime,itemid,patientweight,dup
18,10021927,24623461,34575919,2180-09-20 15:43:00,224385,56.6,duplicated
19,10021927,24623461,34575919,2180-09-20 16:00:00,224385,56.6,duplicated
54,10089244,29469323,33563887,2128-02-11 12:22:00,224385,90.0,duplicated
55,10089244,29469323,33563887,2128-02-11 12:30:00,224385,90.0,duplicated
63,10098215,22394571,36018186,2118-06-03 20:00:00,224385,55.4,duplicated
...,...,...,...,...,...,...,...
8085,19592126,23077014,30077165,2121-11-29 05:30:00,224385,102.4,duplicated
8229,19747328,29300235,38984649,2154-01-05 12:17:00,224385,103.9,duplicated
8230,19747328,29300235,38984649,2154-01-05 12:24:00,224385,103.9,duplicated
8415,19969918,29544887,31213286,2186-01-16 20:29:00,224385,68.0,duplicated


In [14]:
filtered_extubation[filtered_extubation["dup"] == "duplicated"]

Unnamed: 0,subject_id,hadm_id,stay_id,extubationtime,itemid,patientweight,dup
48,10023486,25262533,31361200,2151-06-08 18:00:00,227194,139.9,duplicated
49,10023486,25262533,31361200,2151-06-08 18:03:00,227194,139.9,duplicated
59,10027602,28166872,32391858,2201-11-07 13:40:00,227194,64.0,duplicated
60,10027602,28166872,32391858,2201-11-07 13:53:00,227194,64.0,duplicated
79,10038933,25129047,32166508,2148-09-11 14:00:00,227194,123.0,duplicated
...,...,...,...,...,...,...,...
22577,19807790,27131460,38192208,2148-01-23 09:45:00,225468,52.6,duplicated
22912,19952171,27208415,32980185,2177-02-02 23:27:00,227194,89.0,duplicated
22913,19952171,27208415,32980185,2177-02-02 23:35:00,227194,89.0,duplicated
23025,19997752,29452285,34531437,2128-03-02 18:08:00,227194,86.0,duplicated


In [635]:
filtered_intubation.to_csv('./Data/output/intubation1_dup.csv')
filtered_extubation.to_csv('./Data/output/extubation1_dup.csv')
filtered_intubation[filtered_intubation["dup"] == "duplicated"].to_csv('./Data/output/intubation1_dup_filter.csv')
filtered_extubation[filtered_extubation["dup"] == "duplicated"].to_csv('./Data/output/extubation1_dup_filter.csv')

### Workflow 1-2-1
* 원본 copy후, 안전하게 진행

* timedelta(time_difference)를 구하기 위해 dtype 변경

* time_col1(intubation), time_col2(extubation) 각각 Dataframe의 null 값을 없앤 후 새로운 Dataframe으로 진행

* 개별 데이터 프레임 정렬 진행

* 각 dataframe의 index list를 변수에 저장 (loc 활용 용이하도록)

* 첫 번째 행은 겹칠일이 없기에 not_duplicated 지정

* for문을 활용하여 index를 순차적 조회
    * col1_df (intubationtime이 notnull 처리된 dataframe)에 loc 활용
    * time_difference <= 1이라면, duplicate로 지정
        * 추후 확인을 위해 time_difference <= 1 인 row와 prev_row 두 개 duplicate로 지정정

In [7]:
alignment_df = pd.read_csv('./Data/alignment_df_raw_new.csv', index_col="Unnamed: 0")

In [8]:
alignment_df_exp = alignment_df.copy()

In [9]:
def time_duplicate_event_new(df, time_col1, time_col2, time_delta):
    # 원본을 copy하여 복제한 df로 진행
    origin_df = df.copy()

    # timedelta를 구하기 위해 데이터 타입 변경
    origin_df[time_col1] = pd.to_datetime(origin_df[time_col1])
    origin_df[time_col2] = pd.to_datetime(origin_df[time_col2])

    # intubation, extubation 각각 null 값을 없앤 후 새로운 df로 진행
    col1_df = origin_df[origin_df[time_col1].notnull()]
    col2_df = origin_df[origin_df[time_col2].notnull()]

    # 각각의 df의 순서를 sort
    col1_sorted_df = col1_df.sort_values(by=["subject_id", "hadm_id", time_col1])
    col2_sorted_df = col2_df.sort_values(by=["subject_id", "hadm_id", time_col2])

    # loc에 활용할 index 리스트 반환 
    col1_idx_list = col1_sorted_df.index.to_list()
    col2_idx_list = col2_sorted_df.index.to_list()
    
    
    # 첫 번째 행 (0번째 행)은 겹칠일이 없기에 not_duplicated 지정
    origin_df.loc[0, "int_dup"] = "not_duplicated"
    origin_df.loc[0, "ext_dup"] = "not_duplicated"

    # dataframe의 길이를 가지고 for문 시작.
    for idx in range(1, (len(col1_sorted_df))):

        if (col1_sorted_df.loc[col1_idx_list[idx], "subject_id"] == col1_sorted_df.loc[col1_idx_list[idx-1], "subject_id"]) & (col1_sorted_df.loc[col1_idx_list[idx], "hadm_id"] == col1_sorted_df.loc[col1_idx_list[idx-1], "hadm_id"]):
        
            time_diff = col1_sorted_df.loc[col1_idx_list[idx], time_col1] - col1_sorted_df.loc[col1_idx_list[idx-1], time_col1]
            
            origin_df.loc[col1_idx_list[idx], "int_timediff"] = time_diff
            
            if time_diff <= pd.Timedelta(minutes = time_delta):
                origin_df.loc[col1_idx_list[idx], "int_dup"] = "duplicated"
                origin_df.loc[col1_idx_list[idx-1], "int_dup"] = "duplicated"
                
            else:
                origin_df.loc[col1_idx_list[idx], "int_dup"] = "not_duplicated"
                
        else:
            origin_df.loc[col1_idx_list[idx], "int_dup"] = "not_duplicated"

    for idx in range(1, (len(col2_sorted_df))):

        if (col2_sorted_df.loc[col2_idx_list[idx], "subject_id"] == col2_sorted_df.loc[col2_idx_list[idx-1], "subject_id"]) & (col2_sorted_df.loc[col2_idx_list[idx], "hadm_id"] == col2_sorted_df.loc[col2_idx_list[idx-1], "hadm_id"]):
        
            time_diff = col2_sorted_df.loc[col2_idx_list[idx], time_col2] - col2_sorted_df.loc[col2_idx_list[idx-1], time_col2]
            
            origin_df.loc[col2_idx_list[idx], "ext_timediff"] = time_diff
            
            if time_diff <= pd.Timedelta(minutes = time_delta):
                origin_df.loc[col2_idx_list[idx], "ext_dup"] = "duplicated"
                origin_df.loc[col2_idx_list[idx-1], "ext_dup"] = "duplicated"
                
            else:
                origin_df.loc[col2_idx_list[idx], "ext_dup"] = "not_duplicated"
                
        else:
            origin_df.loc[col2_idx_list[idx], "ext_dup"] = "not_duplicated"    
    
    return origin_df

In [10]:
minutes = [10, 20, 30, 40, 50, 60]

for minute in minutes:
    filter_df = time_duplicate_event_new(alignment_df_exp, "intubationtime", "extubationtime", minute)
    total_int_row = len(filter_df[filter_df['intubationtime'].notnull()])
    total_ext_row = len(filter_df[filter_df['extubationtime'].notnull()])
    portion_dup_int = round((filter_df["int_dup"].value_counts(normalize=True).loc['duplicated']*100), 2)
    portion_dup_ext = round((filter_df["ext_dup"].value_counts(normalize=True).loc['duplicated']*100), 2)
    
    print(f"time_delta가 {minute}분 일때")
    print(f"{'-'*70}")
    print(f'intubation 중복 행 개수 : 전체 intubation 행 개수 {total_int_row}행 중에서, {filter_df["int_dup"].value_counts().loc["duplicated"]}개')
    print(f'intubation 중복 행 비율 : {portion_dup_int} %')
    print(f'extubation 중복 행 개수 : 전체 extubation 행 개수 {total_ext_row}행 중에서, {filter_df["ext_dup"].value_counts().loc["duplicated"]}개')
    print(f'extubation 중복 행 비율 : {portion_dup_ext} %')
    print("")

time_delta가 10분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 199개
intubation 중복 행 비율 : 2.04 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서, 193개
extubation 중복 행 비율 : 1.98 %

time_delta가 20분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 303개
intubation 중복 행 비율 : 3.11 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서, 241개
extubation 중복 행 비율 : 2.47 %

time_delta가 30분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 357개
intubation 중복 행 비율 : 3.66 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서, 259개
extubation 중복 행 비율 : 2.66 %

time_delta가 40분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 391개
intubation 중복 행 비율 : 4.01 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서

In [18]:
filter_df

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,...,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,,1 days 13:10:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,...,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,,3 days 18:19:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,NaT,,...,,2144-04-13 17:31:00,,,,0 days 00:00:00,not_duplicated,,NaT,NaT
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,...,Planned Extubation,2144-02-06 11:45:00,,,2 days 17:30:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,NaT,,...,,2144-02-06 11:45:00,,,,0 days 00:00:00,not_duplicated,,2 days 18:00:00,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9742,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-05-05 18:20:00,224385.0,59.0,35616526.0,2126-05-08 17:29:00,227194.0,...,Planned Extubation,2126-05-19 14:15:00,,,2 days 23:09:00,,not_duplicated,not_duplicated,8 days 22:06:00,6 days 00:59:00
9743,19997367,20617667,,2126-04-20 07:15:00,NaT,,,35616526.0,2126-04-22 08:23:00,227194.0,...,Planned Extubation,2126-05-19 14:15:00,,,,0 days 00:00:00,,not_duplicated,NaT,NaT
9744,19999068,21606769,30143796.0,2161-08-24 04:10:00,2161-08-25 15:34:00,224385.0,55.8,30143796.0,2161-08-28 13:35:00,227194.0,...,Planned Extubation,2161-09-02 19:00:00,,,2 days 22:01:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
9745,19999442,26785317,32336619.0,2148-11-19 10:00:00,2148-11-19 19:00:00,224385.0,107.5,32336619.0,2148-11-20 14:15:00,227194.0,...,Planned Extubation,2148-12-04 16:25:00,,,0 days 19:15:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT


### 추가 검증
* admittime이 추가된 데이터로 추가적인 검증 진행

In [14]:
alignment_admittime = pd.read_csv('./Data/alignment_df_raw_with_admittime.csv', index_col="Unnamed: 0")
alignment_admittime

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,65.0,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,,1 days 13:10:00,0 days 00:00:00
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,99.6,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,,3 days 18:19:00,0 days 00:00:00
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,,,,,2144-04-13 17:31:00,,,,0 days 00:00:00
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,,,2 days 17:30:00,0 days 00:00:00
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,,,,,2144-02-06 11:45:00,,,,0 days 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9742,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-05-05 18:20:00,224385.0,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,,,2 days 23:09:00,
9743,19997367,20617667,,2126-04-20 07:15:00,,,,35616526.0,2126-04-22 08:23:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,,,,0 days 00:00:00
9744,19999068,21606769,30143796.0,2161-08-24 04:10:00,2161-08-25 15:34:00,224385.0,55.8,30143796.0,2161-08-28 13:35:00,227194.0,55.8,Planned Extubation,2161-09-02 19:00:00,,,2 days 22:01:00,0 days 00:00:00
9745,19999442,26785317,32336619.0,2148-11-19 10:00:00,2148-11-19 19:00:00,224385.0,107.5,32336619.0,2148-11-20 14:15:00,227194.0,107.5,Planned Extubation,2148-12-04 16:25:00,,,0 days 19:15:00,0 days 00:00:00


In [15]:
minutes = [10, 20, 30, 40, 50, 60]

for minute in minutes:
    filter_df = time_duplicate_event_new(alignment_admittime, "intubationtime", "extubationtime", minute)
    total_int_row = len(filter_df[filter_df['intubationtime'].notnull()])
    total_ext_row = len(filter_df[filter_df['extubationtime'].notnull()])
    portion_dup_int = round((filter_df["int_dup"].value_counts(normalize=True).loc['duplicated']*100), 2)
    portion_dup_ext = round((filter_df["ext_dup"].value_counts(normalize=True).loc['duplicated']*100), 2)
    
    print(f"time_delta가 {minute}분 일때")
    print(f"{'-'*70}")
    print(f'intubation 중복 행 개수 : 전체 intubation 행 개수 {total_int_row}행 중에서, {filter_df["int_dup"].value_counts().loc["duplicated"]}개')
    print(f'intubation 중복 행 비율 : {portion_dup_int} %')
    print(f'extubation 중복 행 개수 : 전체 extubation 행 개수 {total_ext_row}행 중에서, {filter_df["ext_dup"].value_counts().loc["duplicated"]}개')
    print(f'extubation 중복 행 비율 : {portion_dup_ext} %')
    print("")

time_delta가 10분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 199개
intubation 중복 행 비율 : 2.04 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서, 193개
extubation 중복 행 비율 : 1.98 %

time_delta가 20분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 303개
intubation 중복 행 비율 : 3.11 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서, 241개
extubation 중복 행 비율 : 2.47 %

time_delta가 30분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 357개
intubation 중복 행 비율 : 3.66 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서, 259개
extubation 중복 행 비율 : 2.66 %

time_delta가 40분 일때
----------------------------------------------------------------------
intubation 중복 행 개수 : 전체 intubation 행 개수 8445행 중에서, 391개
intubation 중복 행 비율 : 4.01 %
extubation 중복 행 개수 : 전체 extubation 행 개수 7357행 중에서

In [23]:
filter_df[filter_df["subject_id"] == 10021927]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
20,10021927,24623461,34575919.0,2180-09-20 10:01:00,2180-09-20 15:43:00,224385.0,56.6,,NaT,,...,,2180-09-27 10:32:00,2180-09-27 10:32:00,,,0 days 00:00:00,duplicated,,NaT,NaT
21,10021927,24623461,34575919.0,2180-09-20 10:01:00,2180-09-20 16:00:00,224385.0,56.6,34575919.0,2180-09-27 10:00:00,227194.0,...,Planned Extubation,2180-09-27 10:32:00,2180-09-27 10:32:00,,6 days 18:00:00,0 days 00:00:00,duplicated,not_duplicated,0 days 00:17:00,NaT


In [30]:
filter_df

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,...,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,,1 days 13:10:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,...,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,,3 days 18:19:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,NaT,,...,,2144-04-13 17:31:00,,,,0 days 00:00:00,not_duplicated,,NaT,NaT
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,...,Planned Extubation,2144-02-06 11:45:00,,,2 days 17:30:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,NaT,,...,,2144-02-06 11:45:00,,,,0 days 00:00:00,not_duplicated,,2 days 18:00:00,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9742,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-05-05 18:20:00,224385.0,59.0,35616526.0,2126-05-08 17:29:00,227194.0,...,Planned Extubation,2126-05-19 14:15:00,,,2 days 23:09:00,,not_duplicated,not_duplicated,8 days 22:06:00,6 days 00:59:00
9743,19997367,20617667,,2126-04-20 07:15:00,NaT,,,35616526.0,2126-04-22 08:23:00,227194.0,...,Planned Extubation,2126-05-19 14:15:00,,,,0 days 00:00:00,,not_duplicated,NaT,NaT
9744,19999068,21606769,30143796.0,2161-08-24 04:10:00,2161-08-25 15:34:00,224385.0,55.8,30143796.0,2161-08-28 13:35:00,227194.0,...,Planned Extubation,2161-09-02 19:00:00,,,2 days 22:01:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT
9745,19999442,26785317,32336619.0,2148-11-19 10:00:00,2148-11-19 19:00:00,224385.0,107.5,32336619.0,2148-11-20 14:15:00,227194.0,...,Planned Extubation,2148-12-04 16:25:00,,,0 days 19:15:00,0 days 00:00:00,not_duplicated,not_duplicated,NaT,NaT


In [34]:
# filter_df 전체 저장

filter_df.to_csv('./Data/output/alignment_dup_admittime_dataframe.csv', index=False)

----

## Workflow 1-2-2
* time_col1(intubation), time_col2(extubation) 데이터 프레임 각각 할당

* 개별 데이터 프레임 sort_values 진행

* time_diff를 계산하는 함수를 제작, 개별 데이터 프레임에 lambda 함수를 활용하여 적용

* 적용 뒤엔 각각의 데이터 프레임에 time_diff, dup 컬럼이 만들어졌으므로, 이를 원본 df 데이터에 loc 함수를 이용하여 할당

In [277]:
alignment_df = pd.read_csv('./Data/alignment_df_raw_new.csv', index_col="Unnamed: 0")

In [278]:
alignment_df_exp = alignment_df.copy()

In [279]:
import pandas as pd

def time_duplicate_event_new(df, time_col1, time_col2, time_delta_minute):
    # 시간 열을 datetime 형식으로 변환
    df[time_col1] = pd.to_datetime(df[time_col1])
    df[time_col2] = pd.to_datetime(df[time_col2])

    # notnull로 NaN 값이 없는 dataframe을 제작
    col1_df = df[df[time_col1].notnull()]
    col2_df = df[df[time_col2].notnull()]

    col1_sorted_df = col1_df.sort_values(by=["subject_id", "hadm_id", time_col1])
    col2_sorted_df = col2_df.sort_values(by=["subject_id", "hadm_id", time_col2])
    
    # 시간 차이를 기반으로 중복을 표시하는 함수
    def mark_duplicates(group, time_column):
        # 이전 행과의 시간 차이 계산
        group["time_diff"] = group[time_column].diff()

        result_list = ["duplicated" if td <= pd.Timedelta(minutes=time_delta_minute) else "not_duplicated" for td in group["time_diff"]]
        
        group["dup"] = result_list
        
        # 임계값과 시간 차이를 비교하여 중복 표시
        return group

    
    filter_int = col1_sorted_df.groupby(["subject_id", "hadm_id"], group_keys=False).apply(lambda x: mark_duplicates(x, time_col1))
    filter_ext = col2_sorted_df.groupby(["subject_id", "hadm_id"], group_keys=False).apply(lambda x: mark_duplicates(x, time_col2))

    col1_idx_list = filter_int.index.to_list()
    col2_idx_list = filter_ext.index.to_list()

    for idx in range(1, len(col1_idx_list)):
        # print("현재 인덱스:", idx)
        # print("filter_int 인덱스:", col1_idx_list[idx])
        if filter_int["dup"][col1_idx_list[idx]] == "duplicated":
            df.loc[col1_idx_list[idx-1], "int_dup"] = "duplicated"
            df.loc[col1_idx_list[idx], "int_dup"] = "duplicated"
            df.loc[col1_idx_list[idx], "int_timediff"] = filter_int.loc[col1_idx_list[idx], "time_diff"]
            
    for idx in range(1, len(col2_idx_list)):
        if filter_ext["dup"][col2_idx_list[idx]] == "duplicated":
            df.loc[col2_idx_list[idx-1], "ext_dup"] = "duplicated"
            df.loc[col2_idx_list[idx], "ext_dup"] = "duplicated"
            df.loc[col2_idx_list[idx], "ext_timediff"] = filter_ext.loc[col2_idx_list[idx], "time_diff"]

    return df

In [280]:
df_filter = time_duplicate_event_new(alignment_df_exp, "intubationtime", "extubationtime", 60)

In [286]:
df_filter.to_csv('./Data/output/alignment_dup_full.csv', index=False)

## 2. Replacement 함수 만들기

In [11]:
import pandas as pd

df_filter = pd.read_csv("./Data/output/alignment_dup_admittime_dataframe.csv")

In [12]:
df_filter_exp = df_filter.copy()

In [199]:
'''
경우의 수를 나눠서 생각해보는 것
'''

def row_imputation(group):
    
    # group의 행 개수가 1개이고, int_time, ext_time 두 행중 하나가 null 값이 있을 때
    if len(group) == 1 and (group["intubationtime"].isnull() | group["extubationtime"].isnull()):

        # int_time이 null 값이라면, admittime으로 값 대체
        if group["intubationtime"].isnull():
            group.loc[group.index[0],"intubationtime"] = group.loc[group.index[0],"admittime"]

        # ext_time이 null 이라면, deathtime을 우선으로 할당하고, 없다면 dischtime을 사용
        # try, except 구문으로 바꾸는 것이 더 효율적일 것 같음
        elif group["extubationtime"].isnull():
            if group["deathtime"].notnull():
                group.loc[group.index[0], "extubationtime"] = group.loc[group.index[0], "deathtime"]
            elif group["deathtime"].isnull() & group["dischtime"].notnull():
                group.loc[group.index[0], "extubationtime"] = group.loc[group.index[0], "dischtime"]
                
    # group의 행 개수가 1개 이상 이고, int_time에만 null 값이 있을 때
    elif len(group) > 1 and (group["intubationtime"].isnull() and group["extubationtime"].notnull()):

        # extubationtime 값은 모두 있으므로, group을 extubationtime 기준으로 오름차순 정렬
        group.sort_values(by="extubationtime", inplace=True)

        # iloc를 사용
        # for문으로 group을 조회, 만약 sorting했을 때, int_time null 값이 맨 첫 번째라면? admittime으로 할당
        # 아니라면 이전 행의 extubationtime으로 할당
        for index, row in group.iterrows():
            if pd.isnull(row["intubationtime"]):
                iloc_idx = group.index.get_loc(index)
                iloc_col = group.columns.get_loc("intubationtime")
                if iloc_idx == 0:
                    group.iloc[iloc_idx, iloc_col] = group.iloc[iloc_idx, group.columns.get_loc("admittime")]
                elif iloc_idx > 0:
                    group.iloc[iloc_idx, iloc_col] = group.iloc[iloc_idx-1, group.columns.get_loc("extubationtime")]

    # group의 행 개수가 1개 이상이고, ext_time에만 null 값이 있을 때
    elif len(group) > 1 and (group["intubationtime"].notnull() and group["extubationtime"].isnull()):

        # intubationtime 값은 모두 있으므로, group을 intubationtime 기준으로 오름차순 정렬
        group.sort_values(by="intubationtime", inplace=True)

        # iloc 사용
        # for문으로 group을 조회, 만약 sorting했을 때, ext_time null 값이 맨 마지막 행이라면? deathtime 우선 할당, 없다면 ditchtime
        # 만약 맨 마지막 행이 아니라면, 다음 행의 intubationtime으로 할당
        for index, row in group.iterrows():
            if pd.isnull(row["extubationtime"]):
                iloc_idx = group.index.get_loc(index)
                iloc_col = group.columns.get_loc("extubationtime")
                if iloc_idx == len(group)-1:
                    try:
                        group.iloc[iloc_idx, iloc_col] = group.iloc[iloc_idx, group.columns.get_loc("deathtime")]
                    except:
                        group.iloc[iloc_idx, iloc_col] = group.iloc[iloc_idx, group.columns.get_loc("ditchtime")]
                        
                elif iloc_idx < len(group)-1:
                    group.iloc[iloc_idx, iloc_col] = group.iloc[iloc_idx+1, group.columns.get_loc("intubationtime")]

    # group의 행 개수가 1개 이상이고, int_time, ext_time 둘 다 null 값이 있을 때
    
    
        
    return group

### intubationtime 결측치 처리 테스트 (stop)
* sort 진행시, intubationtime, extubationtime 순으로 정렬을 진행했기 때문에, intubationtime이 우선적으로 처리됨
    * 기본적으로 시간순으로 정렬된 것은 intubationtime인 것으로 알 수 있음
* 그렇다는 건, intubationtime 결측치를 먼저 채우고, extubationtime은 intubationtime의 결측치가 채워진다면 자동적으로 채워진다.
* 먼저 int_time 2개이상 결측치를 처리하는 것을 테스트 해보자

In [300]:
test = df_filter_exp[df_filter_exp["subject_id"] == 11570331].copy()
test

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
1533,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 05:59:00,224385.0,92.4,,,,...,,2135-01-06 10:38:00,2135-01-06 10:38:00,,,0 days 00:00:00,not_duplicated,,,
1534,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 18:36:00,224385.0,92.4,30916265.0,2134-12-28 18:36:00,225477.0,...,Unplanned Extuabtion (non-patient initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 00:00:00,0 days 14:24:00,not_duplicated,not_duplicated,0 days 12:37:00,
1535,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 09:00:00,224385.0,92.4,30916265.0,2134-12-29 14:47:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 05:47:00,0 days 00:15:00,not_duplicated,duplicated,0 days 14:24:00,0 days 20:11:00
1536,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 15:02:00,224385.0,92.4,,,,...,,2135-01-06 10:38:00,2135-01-06 10:38:00,,,,not_duplicated,,0 days 06:02:00,
1537,11570331,26919892,30916265.0,2134-12-28 04:51:00,2135-01-03 17:15:00,224385.0,92.4,30916265.0,2135-01-04 08:54:00,225468.0,...,Unplanned Extuabtion (patient-initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 15:39:00,,not_duplicated,not_duplicated,5 days 02:13:00,5 days 17:57:00
1538,11570331,26919892,,2134-12-28 04:51:00,,,,30916265.0,2134-12-29 14:54:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,,,,duplicated,,0 days 00:07:00
1539,11570331,26919892,,2134-12-28 04:51:00,,,,30916265.0,2134-12-29 14:57:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,,0 days 00:00:00,,duplicated,,0 days 00:03:00


In [301]:
# test의 intubationtime이 null 값인 index만 가져온다.
for idx in test[test["intubationtime"].isna()].index:

    # 하나씩 가져온 index의 iloc, 즉 index location을 확인
    null_idx_loc = test.index.get_loc(idx)

    # test의 마지막 행 전까지 iloc을 확인한다.
    for idx2 in range(len(test)-1):

        # 1. 만약 idx2의 extubationtime 그리고 idx2+1 행의 intubationtime이 null 값이 아닌 것을 만족하고
        # 2. intubationtime이 null 값인 index의 extubationtime이 
        # idx2의 extubationtime보다 크며, idx2+1 행의 intubationtime보다 작은 것을 만족
        # 1과 2를 동시에 만족한다면,
        # null 값의 intubationtime은 해당 idx2의 extubationtime과 같다.
        if ((pd.notnull(test.iloc[idx2, ext_col]) and pd.notnull(test.iloc[idx2+1, int_col])) and 
            ((test.iloc[null_idx_loc, ext_col] > test.iloc[idx2, ext_col]) and (test.iloc[null_idx_loc, ext_col] < test.iloc[idx2+1, int_col]))):
            
            test.iloc[null_idx_loc, int_col] = test.iloc[idx2, ext_col]

            # intubationtime의 null 값이 하나씩 채워질 때마다 sorting 진행
            test.sort_values(by="intubationtime", ascending=True, inplace=True)

In [302]:
# 정렬이 된 df

test

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
1533,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 05:59:00,224385.0,92.4,,,,...,,2135-01-06 10:38:00,2135-01-06 10:38:00,,,0 days 00:00:00,not_duplicated,,,
1534,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 18:36:00,224385.0,92.4,30916265.0,2134-12-28 18:36:00,225477.0,...,Unplanned Extuabtion (non-patient initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 00:00:00,0 days 14:24:00,not_duplicated,not_duplicated,0 days 12:37:00,
1535,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 09:00:00,224385.0,92.4,30916265.0,2134-12-29 14:47:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 05:47:00,0 days 00:15:00,not_duplicated,duplicated,0 days 14:24:00,0 days 20:11:00
1538,11570331,26919892,,2134-12-28 04:51:00,2134-12-29 14:47:00,,,30916265.0,2134-12-29 14:54:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,,,,duplicated,,0 days 00:07:00
1539,11570331,26919892,,2134-12-28 04:51:00,2134-12-29 14:54:00,,,30916265.0,2134-12-29 14:57:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,,0 days 00:00:00,,duplicated,,0 days 00:03:00
1536,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 15:02:00,224385.0,92.4,,,,...,,2135-01-06 10:38:00,2135-01-06 10:38:00,,,,not_duplicated,,0 days 06:02:00,
1537,11570331,26919892,30916265.0,2134-12-28 04:51:00,2135-01-03 17:15:00,224385.0,92.4,30916265.0,2135-01-04 08:54:00,225468.0,...,Unplanned Extuabtion (patient-initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 15:39:00,,not_duplicated,not_duplicated,5 days 02:13:00,5 days 17:57:00


------

### group의 행이 2개 이상이고, int_time, ext_time 둘다 null 값이 존재할 시 예시

In [72]:
df_filter_exp[(df_filter_exp["subject_id"] == 10098215) & (df_filter_exp["hadm_id"] == 22394571)]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
68,10098215,22394571,32817342.0,2118-05-03 23:07:00,2118-05-09 19:02:00,224385.0,60.0,,,,...,,2118-07-16 12:15:00,,,,0 days 00:00:00,not_duplicated,,,
69,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:00:00,224385.0,55.4,,,,...,,2118-07-16 12:15:00,,,,,duplicated,,25 days 00:58:00,
70,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:50:00,224385.0,55.4,36018186.0,2118-06-06 16:20:00,227194.0,...,Planned Extubation,2118-07-16 12:15:00,,,2 days 19:30:00,0 days 09:20:00,duplicated,duplicated,0 days 00:50:00,
71,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-07 01:40:00,224385.0,55.4,36018186.0,2118-06-08 11:55:00,227194.0,...,Planned Extubation,2118-07-16 12:15:00,,,1 days 10:15:00,9 days 13:53:00,not_duplicated,not_duplicated,3 days 04:50:00,1 days 19:22:00
72,10098215,22394571,32301420.0,2118-05-03 23:07:00,2118-06-18 01:48:00,224385.0,48.7,32301420.0,2118-06-21 03:30:00,225468.0,...,Unplanned Extuabtion (patient-initiated),2118-07-16 12:15:00,,,3 days 01:42:00,0 days 04:58:00,not_duplicated,not_duplicated,11 days 00:08:00,12 days 15:35:00
73,10098215,22394571,32301420.0,2118-05-03 23:07:00,2118-06-21 08:28:00,224385.0,48.7,32301420.0,2118-06-27 12:45:00,227194.0,...,Planned Extubation,2118-07-16 12:15:00,,,6 days 04:17:00,,not_duplicated,not_duplicated,3 days 06:40:00,6 days 09:15:00
74,10098215,22394571,,2118-05-03 23:07:00,,,,36018186.0,2118-06-06 16:33:00,227194.0,...,Planned Extubation,2118-07-16 12:15:00,,,,0 days 00:00:00,,duplicated,,0 days 00:13:00


In [73]:
df_filter_exp[(df_filter_exp["subject_id"] == 10119017) & (df_filter_exp["hadm_id"] == 20229125)]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
95,10119017,20229125,31009936.0,2159-08-10 23:55:00,2159-08-14 06:16:00,224385.0,59.7,,,,...,,2159-08-20 21:40:00,2159-08-20 21:40:00,,,0 days 00:00:00,duplicated,,,
96,10119017,20229125,31009936.0,2159-08-10 23:55:00,2159-08-14 06:38:00,224385.0,59.7,31009936.0,2159-08-18 13:38:00,227194.0,...,Planned Extubation,2159-08-20 21:40:00,2159-08-20 21:40:00,,4 days 07:00:00,,duplicated,not_duplicated,0 days 00:22:00,6 days 20:26:00
97,10119017,20229125,,2159-08-10 23:55:00,,,,31009936.0,2159-08-11 17:12:00,227194.0,...,Planned Extubation,2159-08-20 21:40:00,2159-08-20 21:40:00,,,0 days 00:00:00,,not_duplicated,,


In [74]:
df_filter_exp[(df_filter_exp["subject_id"] == 18170491) & (df_filter_exp["hadm_id"] == 27238085)]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
7974,18170491,27238085,32996481.0,2135-05-01 07:46:00,2135-05-06 11:30:00,224385.0,63.4,32996481.0,2135-05-08 10:27:00,225468.0,...,Unplanned Extuabtion (patient-initiated),2135-05-22 15:25:00,,,1 days 22:57:00,0 days 00:00:00,not_duplicated,not_duplicated,,1 days 23:27:00
7975,18170491,27238085,32996481.0,2135-05-01 07:46:00,2135-05-08 10:28:00,224385.0,63.4,,,,...,,2135-05-22 15:25:00,,,,,not_duplicated,,1 days 22:58:00,
7976,18170491,27238085,,2135-05-01 07:46:00,,,,32996481.0,2135-05-06 11:00:00,227194.0,...,Planned Extubation,2135-05-22 15:25:00,,,,0 days 00:00:00,,not_duplicated,,


In [187]:
df_filter_exp[df_filter_exp["subject_id"] == 13921768]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
3827,13921768,29930128,32569450.0,2198-07-16 14:28:00,2198-07-26 10:45:00,224385.0,82.5,32569450.0,2198-07-27 05:46:00,227194.0,...,Planned Extubation,2198-08-01 15:00:00,,,0 days 19:01:00,0 days 00:00:00,not_duplicated,not_duplicated,,7 days 15:06:00
3828,13921768,29930128,32569450.0,2198-07-16 14:28:00,2198-07-27 06:53:00,224385.0,82.5,,,,...,,2198-08-01 15:00:00,,,,,not_duplicated,,0 days 20:08:00,
3829,13921768,29930128,,2198-07-16 14:28:00,,,,35689413.0,2198-07-19 14:40:00,227194.0,...,Planned Extubation,2198-08-01 15:00:00,,,,0 days 00:00:00,,not_duplicated,,


In [188]:
df_filter_exp[df_filter_exp["subject_id"] == 10119017]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
95,10119017,20229125,31009936.0,2159-08-10 23:55:00,2159-08-14 06:16:00,224385.0,59.7,,,,...,,2159-08-20 21:40:00,2159-08-20 21:40:00,,,0 days 00:00:00,duplicated,,,
96,10119017,20229125,31009936.0,2159-08-10 23:55:00,2159-08-14 06:38:00,224385.0,59.7,31009936.0,2159-08-18 13:38:00,227194.0,...,Planned Extubation,2159-08-20 21:40:00,2159-08-20 21:40:00,,4 days 07:00:00,,duplicated,not_duplicated,0 days 00:22:00,6 days 20:26:00
97,10119017,20229125,,2159-08-10 23:55:00,,,,31009936.0,2159-08-11 17:12:00,227194.0,...,Planned Extubation,2159-08-20 21:40:00,2159-08-20 21:40:00,,,0 days 00:00:00,,not_duplicated,,


In [189]:
df_filter_exp[df_filter_exp["subject_id"] == 12502618]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
2379,12502618,20865228,34357675.0,2127-01-04 22:18:00,2127-01-12 10:00:00,224385.0,64.0,,,,...,,2127-02-11 17:51:00,,,,0 days 00:00:00,not_duplicated,,,
2380,12502618,20865228,34357675.0,2127-01-04 22:18:00,2127-01-20 04:30:00,224385.0,64.0,,,,...,,2127-02-11 17:51:00,,,,,not_duplicated,,7 days 18:30:00,
2381,12502618,20865228,,2127-01-04 22:18:00,,,,34357675.0,2127-01-12 09:00:00,227194.0,...,Planned Extubation,2127-02-11 17:51:00,,,,0 days 00:00:00,,not_duplicated,,
2382,12502618,29296554,38790108.0,2119-02-16 05:45:00,2119-02-16 08:06:00,224385.0,62.1,38790108.0,2119-02-16 18:30:00,227194.0,...,Planned Extubation,2119-02-26 17:27:00,,,0 days 10:24:00,0 days 00:00:00,not_duplicated,not_duplicated,,


In [190]:
df_filter_exp[df_filter_exp["subject_id"] == 15573773]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
5396,15573773,24337088,34806651.0,2170-10-09 13:46:00,2170-10-09 15:00:00,224385.0,75.5,34806651.0,2170-10-16 10:45:00,227194.0,...,Planned Extubation,2170-10-25 13:45:00,,,6 days 19:45:00,0 days 00:00:00,not_duplicated,not_duplicated,,
5397,15573773,24337088,34806651.0,2170-10-09 13:46:00,2170-10-16 11:56:00,224385.0,75.5,,,,...,,2170-10-25 13:45:00,,,,0 days 00:00:00,not_duplicated,,6 days 20:56:00,
5398,15573773,26216420,34410366.0,2170-07-30 20:20:00,2170-08-02 15:39:00,224385.0,72.3,34410366.0,2170-08-06 11:05:00,227194.0,...,Planned Extubation,2170-08-16 10:30:00,,,3 days 19:26:00,0 days 00:00:00,not_duplicated,not_duplicated,,3 days 22:20:00
5399,15573773,26216420,34410366.0,2170-07-30 20:20:00,2170-08-09 17:15:00,224385.0,72.3,,,,...,,2170-08-16 10:30:00,,,,,not_duplicated,,7 days 01:36:00,
5400,15573773,26216420,,2170-07-30 20:20:00,,,,34410366.0,2170-08-02 12:45:00,227194.0,...,Planned Extubation,2170-08-16 10:30:00,,,,0 days 00:00:00,,not_duplicated,,


In [191]:
df_filter_exp[df_filter_exp["subject_id"] == 16483436]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
6363,16483436,20116402,37510531.0,2176-03-03 11:05:00,2176-03-09 17:18:00,224385.0,64.0,37510531.0,2176-03-14 13:45:00,227194.0,...,Planned Extubation,2176-04-04 14:40:00,,,4 days 20:27:00,0 days 00:00:00,not_duplicated,not_duplicated,,4 days 20:50:00
6364,16483436,20116402,37510531.0,2176-03-03 11:05:00,2176-03-14 14:11:00,224385.0,64.0,,,,...,,2176-04-04 14:40:00,,,,,not_duplicated,,4 days 20:53:00,
6365,16483436,20116402,,2176-03-03 11:05:00,,,,37510531.0,2176-03-09 16:55:00,225468.0,...,Unplanned Extuabtion (patient-initiated),2176-04-04 14:40:00,,,,0 days 00:00:00,,not_duplicated,,


In [197]:
test = df_filter_exp[df_filter_exp["subject_id"] == 11570331]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
1533,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 05:59:00,224385.0,92.4,,,,...,,2135-01-06 10:38:00,2135-01-06 10:38:00,,,0 days 00:00:00,not_duplicated,,,
1534,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-28 18:36:00,224385.0,92.4,30916265.0,2134-12-28 18:36:00,225477.0,...,Unplanned Extuabtion (non-patient initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 00:00:00,0 days 14:24:00,not_duplicated,not_duplicated,0 days 12:37:00,
1535,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 09:00:00,224385.0,92.4,30916265.0,2134-12-29 14:47:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 05:47:00,0 days 00:15:00,not_duplicated,duplicated,0 days 14:24:00,0 days 20:11:00
1536,11570331,26919892,30916265.0,2134-12-28 04:51:00,2134-12-29 15:02:00,224385.0,92.4,,,,...,,2135-01-06 10:38:00,2135-01-06 10:38:00,,,,not_duplicated,,0 days 06:02:00,
1537,11570331,26919892,30916265.0,2134-12-28 04:51:00,2135-01-03 17:15:00,224385.0,92.4,30916265.0,2135-01-04 08:54:00,225468.0,...,Unplanned Extuabtion (patient-initiated),2135-01-06 10:38:00,2135-01-06 10:38:00,,0 days 15:39:00,,not_duplicated,not_duplicated,5 days 02:13:00,5 days 17:57:00
1538,11570331,26919892,,2134-12-28 04:51:00,,,,30916265.0,2134-12-29 14:54:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,,,,duplicated,,0 days 00:07:00
1539,11570331,26919892,,2134-12-28 04:51:00,,,,30916265.0,2134-12-29 14:57:00,227194.0,...,Planned Extubation,2135-01-06 10:38:00,2135-01-06 10:38:00,,,0 days 00:00:00,,duplicated,,0 days 00:03:00


### 이전 로직

In [201]:
'''
int_stayid, int_itemid, int_weight, intubationtime
ext_stayid, ext_itemid, ext_weight, extubationtime

subject_id, hadm_id가 같아야 같은 환자 + 같은 시기로 인정한다.
int_stayid와 ext_stayid가 동시에 NaN값을 가지는 경우는 없는 것으로 확인 된다.
또한 stayid, itemid, weight의 경우 int와 ext는 상호보완이 가능함.
'''
'''
단, itemid인 경우 이전에 데이터 추출 시, intubation은 1개의 값, extubation은 3개의 값을 사용함.
int_itemid 값은 1개로 통일이 가능하지만, extubation은 3가지 경우의 수가 있고, 4개의 값이 모두 다른 번호를 가지기 때문에
대체하는 것에 대해 생각이 필요할 것으로 판단됨.
'''

# def filled_nan_values(group):
#     if pd.isna(group.loc[i, 'int_stayid']):
#         group.loc[i, 'int_stayid'] = group.loc[i, 'ext_stayid']
#         group.loc[i, 'int_weight'] = group.loc[i, 'ext_weight']
#     elif pd.isna(group.loc[i, 'ext_stayid']):
#         group.loc[i, 'ext_stayid'] = group.loc[i, 'int_stayid']
#         group.loc[i, 'ext_weight'] = group.loc[i, 'int_weight']       

'''
int_time, ext_time의 NaN 값 참고사항
* 같은 행에서 int_time, ext_time 둘다 NaN 값인 경우는 존재하지 않는다.
* groupby로 묶어 처리하는 방법을 수행한다.
* group 내부에 존재하는 intubationtime, extubationtime 의 NaN 값이 각각 2개를 넘지 않는다.
* intubationtime이 NaN 값인 경우, 같은 행의 extubationtime 값이
  group 내부에 존재하는 intubationtime 값보다 더 이전이다.
'''

def filled_nan_time(group):
    nan_intubation_count = group['intubationtime'].isna().sum()
    nan_extubation_count = group['extubationtime'].isna().sum()
    # groupby 내부에서 intubationtime의 NaN 값의 개수가 1개라면
    if nan_intubation_count == 1:
        # index를 변수로 지정 (가독성)
        nan_index = group.index[group['intubationtime'].isna()][0]
        # loc를 활용하여 해당 index의 intubationtime 값을 같은 인덱스의 admittime으로 대체
        group.loc[nan_index, 'intubationtime'] = group.loc[nan_index, 'admittime']
        # 해당 그룹을 정렬 진행
        group.sort_values(by=["intubationtime", "extubationtime"], inplace=True)
        
    # 또는 groupby 내부에서 intubationtime의 NaN 값의 개수가 2개라면    
    elif nan_intubation_count == 2:
        # index를 변수에 지정
        nan_indices = group.index[group['intubationtime'].isna()]
        # NaN 값의 첫 번째 인덱스는 같은 행의 admittime으로 대체
        group.loc[nan_indices[0],'intubationtime'] = group.loc[nan_indices[0],'admittime']
        # NaN 값의 두 번째 인덱스는 이전 행의 extubationtime 값으로 대체
        group.loc[nan_indices[1],'intubationtime'] = group.loc[nan_indices[1] - 1,'extubationtime']
        # 정렬
        group.sort_values(by=["intubationtime", "extubationtime"], inplace=True)

    # groupby 내부에서 extubationtime의 NaN 값의 개수가 1개라면    
    if group['extubationtime'].isna().sum() == 1:        
        # index 변수 지정
        nan_index = group.index[group['extubationtime'].isna()][0]
        # 해당 index의 extubationtime 값을 같은 인덱스의 dischtime (퇴원시간) 으로 대체
        group.loc[nan_index, 'extubationtime'] = group.loc[nan_index,'dischtime']

    # 또는 groupby 내부에서 extubationtime의 NaN 값의 개수가 2개라면    
    elif group['extubationtime'].isna().sum() == 2:
        # index를 변수에 지정
        nan_indices = group.index[group['extubationtime'].isna()]
        # NaN 값 첫 번째 index의 extubationtime 값을 다음 행의 intubationtime 값으로 대체
        group.loc[nan_indices[0],'extubationtime'] = group.loc[nan_indices[0]+1,'intubationtime']
        # NaN 값 두 번째 index의 extubationtime 값을 같은 행의 dischtime 값으로 대체
        group.loc[nan_indices[1],'extubationtime'] = group.loc[nan_indices[1],'dischtime']

    return group

In [194]:
sorted_filled_nan = df_filter_exp.groupby(["subject_id", "hadm_id"], group_keys=False).apply(filled_nan_time)

In [207]:
sorted_filled_nan.head()

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,...,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,ext_dup,int_timediff,ext_timediff
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,...,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,,1 days 13:10:00,0 days 00:00:00,not_duplicated,not_duplicated,,
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,...,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,,3 days 18:19:00,0 days 00:00:00,not_duplicated,not_duplicated,,
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,2144-04-13 17:31:00,,...,,2144-04-13 17:31:00,,,,0 days 00:00:00,not_duplicated,,,
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,...,Planned Extubation,2144-02-06 11:45:00,,,2 days 17:30:00,0 days 00:00:00,not_duplicated,not_duplicated,,
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,2144-02-06 11:45:00,,...,,2144-02-06 11:45:00,,,,0 days 00:00:00,not_duplicated,,2 days 18:00:00,


In [206]:
sorted_filled_nan.to_csv('./Data/output/sorted_filled_nan.csv', index=False)