# 목표
1. 중복 이벤트 삭제 함수 만들기
2. Replacement 함수 만들기
3. Reintubation 함수 만들기

In [584]:
# Load the intubation events; they must come from intubation1.csv, not the extubation file.
intubation1 = pd.read_csv('./Data/intubation1.csv')

In [585]:
# Load the extubation events from CSV.
extubation1 = pd.read_csv('./Data/extubation1.csv')

## 1. 삽관 및 발관 중복 이벤트 삭제하기

### Workflow 1-1
* 기존에 있는 intubation1, extubation1 dataframe으로 함수 작성

* 중복 제거 후, time_diff를 구하기 위해 dtype 변경

* loc를 원활히 활용하기 위해 index 초기화 진행

* 첫 번째 행 (0번째 행)은 겹칠 일이 없기에 not_duplicated로 지정

* for문을 활용하여 index를 순차적 조회
    * prev_row, current_row의 subject_id, hadm_id가 같다면 같은 환자 및 시기로 판단
    * time_difference = int_time or ext_time의 current_row - prev_row
    * time_difference <= 1시간 이라면, duplicate로 지정
        * 추후 확인을 위해 time_difference <= 1시간 인 row와 prev_row 두 개 모두 duplicate로 지정

In [618]:
import pandas as pd
# Load the intubation and extubation event tables from their own CSV files.
intubation1 = pd.read_csv('./Data/intubation1.csv')
extubation1 = pd.read_csv('./Data/extubation1.csv')

In [619]:
def time_duplicate_event(Original_dataframe, time_col, max_gap=pd.Timedelta(hours=1)):
    """Flag events that lie within ``max_gap`` of a neighbouring event.

    Exact duplicate rows are dropped, ``time_col`` is parsed to datetime and
    the rows are sorted by ``subject_id``, ``hadm_id`` and ``time_col``.  Two
    consecutive rows that share both ``subject_id`` and ``hadm_id`` and whose
    timestamps differ by at most ``max_gap`` are BOTH labelled
    ``"duplicated"`` (so the pair can be reviewed together); every other row
    is labelled ``"not_duplicated"``.

    Args:
        Original_dataframe: DataFrame containing ``subject_id``, ``hadm_id``
            and ``time_col`` columns.  It is not modified.
        time_col: Name of the event timestamp column.
        max_gap: Largest gap (inclusive) between consecutive events that is
            still treated as a duplicate.  Defaults to one hour.

    Returns:
        A new DataFrame, sorted as described above, with a fresh ``0..n-1``
        index and an added ``dup`` column.  An empty input yields an empty
        result (no placeholder row is created).
    """
    # Explicit copy: assigning into the result of drop_duplicates() through
    # .loc may warn and can leave the column with object dtype.
    events = Original_dataframe.drop_duplicates().copy()

    # Replace the whole column so it really becomes datetime64.
    events[time_col] = pd.to_datetime(events[time_col])

    # Sort so that events of the same patient/admission are adjacent and in
    # chronological order, then renumber the rows.
    events = events.sort_values(
        by=["subject_id", "hadm_id", time_col], ascending=True
    ).reset_index(drop=True)

    # A row belongs to the same patient/admission as the previous row only if
    # both identifiers match (the first row never matches; NaN never matches).
    same_admission = (events["subject_id"] == events["subject_id"].shift()) & (
        events["hadm_id"] == events["hadm_id"].shift()
    )

    # Gap to the previous row; NaT compares as False, i.e. "not close".
    gap = events[time_col].diff()
    close_to_prev = same_admission & (gap <= max_gap)

    # The earlier row of a close pair is flagged as well.
    close_to_next = close_to_prev.shift(-1, fill_value=False)
    is_dup = close_to_prev | close_to_next

    events["dup"] = "not_duplicated"
    events.loc[is_dup, "dup"] = "duplicated"

    return events

In [620]:
# Experiment on copies so the loaded tables themselves are left as they are.
intubation1_exp = intubation1.copy()
extubation1_exp = extubation1.copy()

In [621]:
# Flag near-duplicate events in both tables and report how long it took.
import time  # imported in this cell so it runs without relying on an earlier import

# perf_counter() is the clock intended for measuring elapsed time.
start_time = time.perf_counter()

filtered_intubation = time_duplicate_event(intubation1_exp, "intubationtime")
filtered_extubation = time_duplicate_event(extubation1_exp, "extubationtime")

# Shape of the subset of rows that were flagged as duplicated.
print(f"intubation_shape = {filtered_intubation[filtered_intubation['dup'] == 'duplicated'].shape}")
print(f"extubation_shape = {filtered_extubation[filtered_extubation['dup'] == 'duplicated'].shape}")

end_time = time.perf_counter()
wait_time = end_time - start_time

print("----")
print(f"경과 시간 : {round(wait_time, 2)} sec")


intubation_shape = (423, 7)
extubation_shape = (763, 7)
----
경과 시간 : 5.75 sec


In [622]:
# Show only the intubation rows flagged as duplicated.
filtered_intubation[filtered_intubation["dup"] == "duplicated"]

Unnamed: 0,subject_id,hadm_id,stay_id,intubationtime,itemid,patientweight,dup
18,10021927,24623461,34575919,2180-09-20 15:43:00,224385,56.6,duplicated
19,10021927,24623461,34575919,2180-09-20 16:00:00,224385,56.6,duplicated
54,10089244,29469323,33563887,2128-02-11 12:22:00,224385,90.0,duplicated
55,10089244,29469323,33563887,2128-02-11 12:30:00,224385,90.0,duplicated
63,10098215,22394571,36018186,2118-06-03 20:00:00,224385,55.4,duplicated
...,...,...,...,...,...,...,...
8085,19592126,23077014,30077165,2121-11-29 05:30:00,224385,102.4,duplicated
8229,19747328,29300235,38984649,2154-01-05 12:17:00,224385,103.9,duplicated
8230,19747328,29300235,38984649,2154-01-05 12:24:00,224385,103.9,duplicated
8415,19969918,29544887,31213286,2186-01-16 20:29:00,224385,68.0,duplicated


In [623]:
# Show only the extubation rows flagged as duplicated.
filtered_extubation[filtered_extubation["dup"] == "duplicated"]

Unnamed: 0,subject_id,hadm_id,stay_id,extubationtime,itemid,patientweight,dup
48,10023486,25262533,31361200,2151-06-08 18:00:00,227194,139.9,duplicated
49,10023486,25262533,31361200,2151-06-08 18:03:00,227194,139.9,duplicated
59,10027602,28166872,32391858,2201-11-07 13:40:00,227194,64.0,duplicated
60,10027602,28166872,32391858,2201-11-07 13:53:00,227194,64.0,duplicated
79,10038933,25129047,32166508,2148-09-11 14:00:00,227194,123.0,duplicated
...,...,...,...,...,...,...,...
22577,19807790,27131460,38192208,2148-01-23 09:45:00,225468,52.6,duplicated
22912,19952171,27208415,32980185,2177-02-02 23:27:00,227194,89.0,duplicated
22913,19952171,27208415,32980185,2177-02-02 23:35:00,227194,89.0,duplicated
23025,19997752,29452285,34531437,2128-03-02 18:08:00,227194,86.0,duplicated


In [635]:
# Save the fully labelled tables, then only the rows flagged as duplicated.
# The DataFrame index is written as the first CSV column (to_csv default).
filtered_intubation.to_csv('./Data/output/intubation1_dup.csv')
filtered_extubation.to_csv('./Data/output/extubation1_dup.csv')
filtered_intubation[filtered_intubation["dup"] == "duplicated"].to_csv('./Data/output/intubation1_dup_filter.csv')
filtered_extubation[filtered_extubation["dup"] == "duplicated"].to_csv('./Data/output/extubation1_dup_filter.csv')

### Workflow1-2
* pat_col1, pat_col2 추가
    * subject_id는 동일하나 int_stayid, ext_stayid 가 각각 존재하므로 안전하게 지정을 해주는 것으로 진행


* 원본 copy후, 안전하게 진행

* timedelta(time_difference)를 구하기 위해 dtype 변경

* time_col1(intubation), time_col2(extubation) 각각 Dataframe의 null 값을 없앤 후 새로운 Dataframe으로 진행
    * 현재는 intubation만 사용 예정

* 각 dataframe의 index list를 변수에 저장 (loc 활용 용이하도록)

* 첫 번째 행은 겹칠일이 없기에 not_duplicated 지정

* for문을 활용하여 index를 순차적 조회
    * col1_df (intubationtime이 notnull 처리된 dataframe)에 loc 활용
    * time_difference <= 1시간 이라면, duplicate로 지정
        * 추후 확인을 위해 time_difference <= 1시간 인 row와 prev_row 두 개 모두 duplicate로 지정

In [625]:
# Load the alignment table, using its "Unnamed: 0" column as the index.
alignment_df = pd.read_csv('./Data/alignment_df_raw_new.csv', index_col="Unnamed: 0")

In [626]:
def _flag_close_events(frame, time_col, pat_col, max_gap):
    """Return ``(is_dup, gap)`` for the rows of ``frame`` with a non-null ``time_col``.

    Both results are indexed by the labels of those rows.  ``gap`` is the time
    since the previous non-null row when that row has the same ``subject_id``
    and ``pat_col`` value, otherwise NaT.  ``is_dup`` is True for both rows of
    every such pair whose gap is at most ``max_gap``.
    """
    events = frame.loc[frame[time_col].notna(), ["subject_id", pat_col, time_col]]

    # Compare each event with the previous event in the frame's row order.
    same_stay = (events["subject_id"] == events["subject_id"].shift()) & (
        events[pat_col] == events[pat_col].shift()
    )
    gap = events[time_col].diff().where(same_stay)

    # NaT compares as False, so rows without a matching predecessor are not close.
    close_to_prev = gap <= max_gap
    is_dup = close_to_prev | close_to_prev.shift(-1, fill_value=False)
    return is_dup, gap


def time_duplicate_event_new(Original_dataframe, time_col1, time_col2, pat_col1, pat_col2,
                             max_gap=pd.Timedelta(hours=1)):
    """Label intubation events that lie within ``max_gap`` of a neighbouring one.

    Works on a copy of ``Original_dataframe`` and keeps its row order and
    index.  Only rows with a non-null ``time_col1`` are examined; each is
    compared with the previous such row.  When both rows share ``subject_id``
    and ``pat_col1``, the gap is stored in ``int_timediff``, and if the gap is
    at most ``max_gap`` both rows get ``int_dup == "duplicated"``.  All other
    examined rows get ``"not_duplicated"``; rows without ``time_col1`` keep
    NaN / NaT in the new columns.

    Rows are compared in the order given, so the input is assumed to be
    ordered by subject, stay and time already — TODO confirm with the caller.
    The index must be unique because results are written back by index label.

    Args:
        Original_dataframe: DataFrame with ``subject_id`` plus the columns
            named by the other arguments.  It is not modified.
        time_col1: Intubation timestamp column (parsed to datetime).
        time_col2: Extubation timestamp column (parsed to datetime only).
        pat_col1: Stay identifier column paired with ``time_col1``.
        pat_col2: Stay identifier column paired with ``time_col2``; accepted
            for interface compatibility and currently unused.
        max_gap: Largest gap (inclusive) still treated as a duplicate.

    Returns:
        The copied DataFrame with ``int_dup`` and ``int_timediff`` added.
    """
    origin_df = Original_dataframe.copy()

    # Datetime dtype is needed to compute time differences.
    origin_df[time_col1] = pd.to_datetime(origin_df[time_col1])
    origin_df[time_col2] = pd.to_datetime(origin_df[time_col2])

    is_dup, gap = _flag_close_events(origin_df, time_col1, pat_col1, max_gap)

    # Label every examined row, including the first one, whatever its index
    # label is; hard-coding label 0 could append a row or label the wrong one.
    labels = pd.Series("not_duplicated", index=is_dup.index, dtype=object)
    labels[is_dup] = "duplicated"

    # Index-aligned assignment: rows without an intubation time get NaN / NaT.
    origin_df["int_dup"] = labels
    origin_df["int_timediff"] = gap

    # NOTE: extubation (time_col2 / pat_col2) is not labelled yet; the same
    # helper can be applied to it once ext_dup / ext_timediff are required.
    return origin_df

In [627]:
# Experiment on a copy of the alignment table.
alignment_df_exp = alignment_df.copy()

In [628]:
# Apply the function and inspect the result.

filter_df = time_duplicate_event_new(alignment_df_exp, "intubationtime", "extubationtime", "int_stayid", "ext_stayid")
filter_df

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,int_timediff
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,65.0,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,,1 days 13:10:00,0 days 00:00:00,not_duplicated,NaT
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,99.6,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,,3 days 18:19:00,0 days 00:00:00,not_duplicated,NaT
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,NaT,,,,2144-04-13 17:31:00,,,,0 days 00:00:00,not_duplicated,NaT
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,,,2 days 17:30:00,0 days 00:00:00,not_duplicated,NaT
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,NaT,,,,,,,,0 days 00:00:00,not_duplicated,2 days 18:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9742,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-05-05 18:20:00,224385.0,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,,,2 days 23:09:00,,not_duplicated,8 days 22:06:00
9743,19997367,20617667,,,NaT,,,35616526.0,2126-04-22 08:23:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,,,,0 days 00:00:00,,NaT
9744,19999068,21606769,30143796.0,2161-08-24 04:10:00,2161-08-25 15:34:00,224385.0,55.8,30143796.0,2161-08-28 13:35:00,227194.0,55.8,Planned Extubation,2161-09-02 19:00:00,,,2 days 22:01:00,0 days 00:00:00,not_duplicated,NaT
9745,19999442,26785317,32336619.0,2148-11-19 10:00:00,2148-11-19 19:00:00,224385.0,107.5,32336619.0,2148-11-20 14:15:00,227194.0,107.5,Planned Extubation,2148-12-04 16:25:00,,,0 days 19:15:00,0 days 00:00:00,not_duplicated,NaT


In [631]:
# Show the rows whose intubationtime is not null.

filter_df[filter_df['intubationtime'].notnull()]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,int_timediff
0,10001884,26184834,37510196.0,2131-01-07 20:39:00,2131-01-11 04:30:00,224385.0,65.0,37510196.0,2131-01-12 17:40:00,227194.0,65.0,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00,,1 days 13:10:00,0 days 00:00:00,not_duplicated,NaT
1,10003400,23559586,38383343.0,2137-08-04 00:07:00,2137-08-17 21:21:00,224385.0,90.5,38383343.0,2137-08-21 15:40:00,227194.0,99.6,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00,,3 days 18:19:00,0 days 00:00:00,not_duplicated,NaT
2,10004401,27939719,31202136.0,2144-04-11 03:31:00,2144-04-11 05:03:00,224385.0,120.0,,NaT,,,,2144-04-13 17:31:00,,,,0 days 00:00:00,not_duplicated,NaT
3,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-27 19:00:00,224385.0,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,,,2 days 17:30:00,0 days 00:00:00,not_duplicated,NaT
4,10004401,29988601,32773003.0,2144-01-23 07:58:00,2144-01-30 13:00:00,224385.0,76.0,,NaT,,,,,,,,0 days 00:00:00,not_duplicated,2 days 18:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9741,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-04-26 20:14:00,224385.0,59.0,35616526.0,2126-05-02 16:30:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,,,5 days 20:16:00,0 days 00:00:00,not_duplicated,NaT
9742,19997367,20617667,35616526.0,2126-04-20 07:15:00,2126-05-05 18:20:00,224385.0,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,,,2 days 23:09:00,,not_duplicated,8 days 22:06:00
9744,19999068,21606769,30143796.0,2161-08-24 04:10:00,2161-08-25 15:34:00,224385.0,55.8,30143796.0,2161-08-28 13:35:00,227194.0,55.8,Planned Extubation,2161-09-02 19:00:00,,,2 days 22:01:00,0 days 00:00:00,not_duplicated,NaT
9745,19999442,26785317,32336619.0,2148-11-19 10:00:00,2148-11-19 19:00:00,224385.0,107.5,32336619.0,2148-11-20 14:15:00,227194.0,107.5,Planned Extubation,2148-12-04 16:25:00,,,0 days 19:15:00,0 days 00:00:00,not_duplicated,NaT


In [633]:
# Show only the rows labelled as duplicated.

filter_df[filter_df["int_dup"] == "duplicated"]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime,marker,int_ext_timediff,reintubationtime,int_dup,int_timediff
20,10021927,24623461,34575919.0,2180-09-20 10:01:00,2180-09-20 15:43:00,224385.0,56.6,,NaT,,,,,,,,0 days 00:00:00,duplicated,NaT
21,10021927,24623461,34575919.0,2180-09-20 10:01:00,2180-09-20 16:00:00,224385.0,56.6,34575919.0,2180-09-27 10:00:00,227194.0,56.6,Planned Extubation,2180-09-27 10:32:00,2180-09-27 10:32:00,,6 days 18:00:00,0 days 00:00:00,duplicated,0 days 00:17:00
60,10089244,29469323,33563887.0,2128-02-09 21:50:00,2128-02-11 12:22:00,224385.0,90.0,,NaT,,,,,,,,0 days 00:00:00,duplicated,NaT
61,10089244,29469323,33563887.0,2128-02-09 21:50:00,2128-02-11 12:30:00,224385.0,90.0,,NaT,,,,,,,,0 days 00:00:00,duplicated,0 days 00:08:00
69,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:00:00,224385.0,55.4,,NaT,,,,,,,,,duplicated,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9331,19592126,23077014,30077165.0,2121-11-28 13:32:00,2121-11-29 05:30:00,224385.0,102.4,30077165.0,2121-12-01 14:00:00,227194.0,102.4,Planned Extubation,2121-12-01 14:00:00,2121-12-01 14:00:00,,2 days 08:30:00,0 days 00:00:00,duplicated,0 days 00:30:00
9500,19747328,29300235,38984649.0,2154-01-04 14:50:00,2154-01-05 12:17:00,224385.0,103.9,,NaT,,,,,,,,0 days 00:00:00,duplicated,NaT
9501,19747328,29300235,38984649.0,2154-01-04 14:50:00,2154-01-05 12:24:00,224385.0,103.9,38984649.0,2154-01-12 10:25:00,227194.0,103.9,Planned Extubation,2154-02-12 09:05:00,2154-02-12 09:05:00,,6 days 22:01:00,,duplicated,0 days 00:07:00
9713,19969918,29544887,31213286.0,2186-01-15 23:00:00,2186-01-16 20:29:00,224385.0,68.0,,NaT,,,,,,,,0 days 00:00:00,duplicated,NaT


In [634]:
# Save the whole filter_df.

filter_df.to_csv('./Data/output/alignment_dup_dataframe.csv')

# Save separately the rows that have an intubation time.
filter_df[filter_df['intubationtime'].notnull()].to_csv('./Data/output/alignment_dup_int_time.csv')

# Save separately the rows labelled as duplicated (by intubation time).
filter_df[filter_df["int_dup"] == "duplicated"].to_csv('./Data/output/alignment_filter_dup.csv')