In [1]:
import pandas as pd

In [119]:
adults_icu = pd.read_csv('../outputs/adults_icu2.csv')
intubation_extubation = pd.read_csv('../outputs/intubation_extubation.csv')
intubation_extubation_stay = pd.read_csv('../outputs/intubation_extubation_stay.csv')

print(adults_icu.shape)
print(intubation_extubation.shape)
print(intubation_extubation_stay.shape)

# 시간변수 바꿔주기
intubation_extubation['int_ext_time'] = pd.to_timedelta(intubation_extubation['int_ext_time'])
intubation_extubation['intubationtime'] = pd.to_datetime(intubation_extubation['intubationtime'])
intubation_extubation['extubationtime'] = pd.to_datetime(intubation_extubation['extubationtime'])

intubation_extubation = intubation_extubation.drop(columns=['int_ext_time'])

intubation_extubation.info()

(73181, 13)
(10928, 15)
(10111, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10928 entries, 0 to 10927
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subject_id       10928 non-null  int64         
 1   hadm_id          10928 non-null  int64         
 2   int_stayid       10928 non-null  int64         
 3   admittime        10928 non-null  object        
 4   intubationtime   10928 non-null  datetime64[ns]
 5   int_itemid       10928 non-null  int64         
 6   int_weight       10928 non-null  float64       
 7   ext_stayid       9387 non-null   float64       
 8   extubationtime   9387 non-null   datetime64[ns]
 9   ext_itemid       9387 non-null   float64       
 10  ext_weight       9387 non-null   float64       
 11  extubationcause  9387 non-null   object        
 12  dischtime        10928 non-null  object        
 13  deathtime        2950 non-null   object        
dtypes:

#### Subject level

- 개별 환자 데이터 기준으로 전처리 코드 작성 중에 있음.
- 1) 하나의 hadm_id에 1개의 intubation-extubation pair가 존재하는 경우 (코드 완성):
    - 발관 시간에 결측치가 존재하는가? 
        - deathtime, dischtime 등 대체 가능한 값이 존재하는가? 
    - 삽관-발관 시간 순서가 맞는가?

- 2) 하나의 hadm_id 에 2개 이상의 intubation-extubation pair가 존재하는 경우(즉, 삽관/발관이 여러차례 이루어진 경우), 아래 pseudocode와 같은 정렬 알고리즘을 사용해서 작업해보고자 함 (작업 중).
    - 1. hadm_id 별로 데이터를 groupby 해줌.
    - 2. 각 hadm_id 그룹 별로 intubationtime, extubationtime의 고유값(unique value) 추출
        - 전제: intubation, extubation time의 중복값은 오류로 취급
    - 3. intubationtime과 extubationtime 에 대해 아래와 같은 정렬 알고리즘 적용 (아직 여러 subject 케이스로 테스트 필요)


```
# 정렬 알고리즘
function find_pairs(start_times, end_times):
    # 빈 리스트 pairs 생성
    pairs = []
    # end_times의 인덱스를 추적하기 위한 변수 초기화
    end_time_index = []

    # start_times의 모든 요소에 대해 반복
    for each start_time in start_times:
        # end_times 리스트의 끝에 도달할 때까지 반복
        while end_time_index < length of end_times:
            # 현재 start_time보다 늦은 end_time 찾기
            if end_times[end_time_index] > start_time:
                # 다음 pair의 start_time과 겹치지 않는지 확인
                if end_time_index + 1 < length of end_times and end_times[end_time_index] > start_times[end_time_index + 1]:
                    break
                # 조건을 만족하는 pair를 pairs 리스트에 추가
                append (start_time, end_times[end_time_index]) to pairs
                # 다음 end_time으로 이동
                increment end_time_index
                break
            else:
                # 다음 end_time으로 이동
                increment end_time_index
        
        # 적절한 end_time을 찾지 못한 경우 start_time을 null과 짝지음
        if end_time_index >= length of end_times:
            append (start_time, null) to pairs
    
    # 모든 pair를 포함한 리스트 반환
    return pairs


```

### BASE CODE

In [128]:

## ROW 단위로 적용되는 함수들

def impute_extubationtime(row):
    """
    누락된 extubationtime을 1) deathtime 2) dischtime 순으로 대체합니다.
    이 함수는 수정된 row를 반환하고 원본 DataFrame은 변경하지 않습니다.

    Args:
    row (pd.Series): DataFrame의 행입니다.

    Returns:
    pd.Series: 수정된 행입니다.
    String: 결측치를 대체한 값에 대한 설명입니다 (deathtime, dischtime) 메인함수에서 logging을 위해 사용됩니다.
    """
    modified_row = row.copy()
    impute_with = None   # 대체값 초기화
    
    if pd.isnull(row['extubationtime']):
        if pd.notnull(row['deathtime']):
            modified_row['extubationtime'] = row['deathtime']
            impute_with = 'deathtime'
        else:
            modified_row['extubationtime'] = row['dischtime']
            impute_with = 'dischtime'

    return modified_row, impute_with


def validate_time_difference(row, df, log_messages):
    """
    intubationtime과 extubationtime 사이의 시간 차이를 검증합니다.
    extubationtime이 intubationtime 보다 뒤에 있어야 합니다. 
    그렇지 않을 경우 log_message가 저장됩니다.  

    Args:
    row (pd.Series): DataFrame의 행입니다.
    df (pd.DataFrame): 처리 중인 DataFrame입니다.
    log_messages (list): 로그 메시지를 저장하는 리스트입니다.

    Returns:
    bool: 시간 차이가 유효한지 여부를 반환합니다 (True: 유효, False: 무효).

    """
    subject_id = row['subject_id']
    hadm_id = row['hadm_id']
    index = row.name

    # 시간 차이 계산
    time_diff = pd.to_datetime(df.at[index, 'extubationtime']) - pd.to_datetime(row['intubationtime'])
    if time_diff <= pd.Timedelta(0):
        log_messages.append(f"{index}, {subject_id}, {hadm_id}: 유효하지 않은 데이터 (extubationtime <= intubationtime)")

    return time_diff >= pd.Timedelta(0)

## GROUP 단위로 적용되는 함수들

def count_unique_events(group):
    """
    주어진 그룹에서 고유한 (시간이 중복되지 않는) intubation 및 extubation 이벤트의 수를 "개별적으로" 계산합니다 (null 값 포함).

    Args:
    group (DataFrame): 특정 그룹화된 DataFrame입니다.

    Returns:
    List: 고유한 intubation 시간과 고유한 extubation 시간의 리스트
    """
    # intubationtime 및 extubationtime의 고유한 값과 Null 값을 계산
    unique_intubations = group['intubationtime'].unique()
    unique_extubations = group['extubationtime'].unique()
    
    isna_extubations_count = group['extubationtime'].isna().sum()
    unique_int_count = len(unique_intubations)
    unique_ext_count = len(unique_extubations)

    # 결과 출력
    print(f"고유한 intubation 이벤트 수: {unique_int_count}")
    print(f"고유한 extubation 이벤트 수: {unique_ext_count}")
    print(f"extubation 결측치 수: {isna_extubations_count}")

    print(f'--고유한 Intubation 이벤트--')
    print(unique_intubations)
    print(f'--고유한 Extubation 이벤트--')
    print(unique_extubations)

    return unique_intubations, unique_extubations


def handle_single_event_cases(group, modified_df):
    """
    처리 그룹에서 단일 이벤트 케이스를 처리합니다.

    Args:
    group (pd.DataFrame): 처리할 그룹 DataFrame.
    modified_df (pd.DataFrame): 수정된 DataFrame.

    Returns:
    pd.DataFrame: 수정된 DataFrame.
    list: 생성된 로그 메시지 리스트.
    """
    log_messages = []
    index = group.index[0]  # 행의 인덱스 가져오기
    row = group.iloc[0]     # 처리할 행 가져오기

    # impute_extubationtime 함수를 호출하여 row 수정 및 impute_with 값 받기
    modified_row, impute_with = impute_extubationtime(row)
    modified_df.loc[index] = modified_row  # 수정된 row로 DataFrame 업데이트

    # 로그 메시지 추가
    if row['extubationtime'] != modified_row['extubationtime']:
        log_message = f"row:{index}, sub_id:{group['subject_id'].iloc[0]}, hadm_id:{group['hadm_id'].iloc[0]}: extubationtime {impute_with}로 수정됨"
        log_messages.append(log_message)

    return modified_df, log_messages


# 정렬 알고리즘
# def find_pairs(unique_intubations, unique_extubations):
#     """
#     고유한 삽관 시간과 발관 시간을 짝지어 주는 함수입니다.

#     Args:
#     unique_intubations (list): 고유한 삽관 시간의 리스트.
#     unique_extubations (list): 고유한 발관 시간의 리스트.

#     Returns:
#     list: 삽관 시간과 발관 시간이 짝지어진 리스트. 발관 시간이 적절하지 않은 경우 None으로 표시됩니다.
#     """

#     pairs = []  # 삽관/발관 페어를 저장할 빈 리스트
#     ext_index = 0  # 발관 시간의 인덱스

#     # 모든 삽관 시간에 대하여
#     for int_index, int_time in enumerate(unique_intubations):
#         # 발관 시간 리스트를 순회
#         while ext_index < len(unique_extubations):
#             # 현재 발관 시간이 삽관 시간보다 늦을 경우
#             if unique_extubations[ext_index] > int_time:
#                 # 다음 삽관 시간과 현재 발관 시간을 비교
#                 if (int_index + 1 < len(unique_intubations) and 
#                     unique_extubations[ext_index] > unique_intubations[int_index + 1]):
#                     break
#                 # 조건을 만족하는 경우, 짝을 pairs 리스트에 추가
#                 pairs.append((int_time, unique_extubations[ext_index]))
#                 ext_index += 1  # 다음 발관 시간으로 이동
#                 break
#             else:
#                 ext_index += 1  # 다음 발관 시간으로 이동
        
#         # 적절한 발관 시간을 찾지 못한 경우, None으로 짝을 지음
#         if ext_index >= len(unique_extubations):
#             pairs.append((int_time, None))
    
#     return pairs  # 짝지어진 리스트 반환

# 정렬 알고리즘 v2
def find_pairs(unique_intubations, unique_extubations):
    """
    고유한 삽관 시간과 발관 시간을 쌍으로 짝지어 주는 함수입니다.
        조건 1. 삽관 시간보다 발관 시간이 뒤에 발생해야 함.
        조건 2. 현재 행의 발관 시간이 다음 행의 삽관 시간과 같거나 빨라야 함.

    Args:
    unique_intubations (list): 고유한 삽관 시간의 리스트.
    unique_extubations (list): 고유한 발관 시간의 리스트.

    Returns:
    list: 삽관 시간과 발관 시간이 짝지어진 리스트.
          적절한 발관 시간을 찾지 못할 경우, 발관 시간은 None으로 표시됩니다.
    """
    pairs = []  # 삽관과 발관 시간의 쌍을 저장할 리스트
    ext_index = 0  # 발관 시간 리스트를 탐색하기 위한 인덱스

    # 모든 삽관 시간에 대하여 반복
    for int_index, int_time in enumerate(unique_intubations):
        matched = False  # 현재 삽관 시간에 대한 적절한 발관 시간을 찾았는지 여부

        # 발관 시간 리스트를 순회
        while ext_index < len(unique_extubations) and not matched:
            ext_time = unique_extubations[ext_index]

            # 현재 발관 시간이 삽관 시간보다 늦은지 확인
            if ext_time > int_time:
                # 마지막 삽관 시간인지, 또는 현재 발관 시간이 다음 삽관 시간보다 이른지 확인
                if int_index + 1 == len(unique_intubations) or ext_time < unique_intubations[int_index + 1]:
                    pairs.append((int_time, ext_time))  # 쌍으로 추가
                    matched = True  # 적절한 쌍을 찾음
                else:
                    # 현재 발관 시간이 다음 삽관 시간보다 늦음, 더 이상 진행하지 않음
                    break

            ext_index += 1  # 다음 발관 시간으로 이동

        # 적절한 발관 시간을 찾지 못한 경우
        if not matched:
            pairs.append((int_time, None))  # 삽관 시간과 None을 쌍으로 추가

    return pairs  # 짝지어진 삽관/발관 시간 리스트 반환

# def extract_row_data(group, intubation_time, extubation_time):
#     # Filter the group based on the intubation and extubation times
#     filtered_group = group[(group['intubationtime'] == intubation_time) & (group['extubationtime'] == extubation_time)]

#     # If no matching row is found, return an empty dict or handle accordingly
#     if filtered_group.empty:
#         return {}

#     row_data = filtered_group.iloc[0]

#     # Handle null extubation time
#     if pd.isnull(extubation_time):
#         for col in ['ext_stayid', 'extubationtime', 'ext_itemid', 'ext_weight', 'extubationcause']:
#             row_data[col] = None

#     # Handle null intubation time
#     if pd.isnull(intubation_time):
#         for col in ['int_stayid', 'admittime', 'intubationtime', 'int_itemid', 'int_weight']:
#             row_data[col] = None

#     return row_data.to_dict()

# 수정 필요
def extract_row_data(group, intubation_time, extubation_time):
    # Initialize a dictionary to store the row data
    row_data = {}

    # Filter for intubation and extubation
    intubation_filtered = group[group['intubationtime'] == intubation_time]
    extubation_filtered = group[group['extubationtime'] == extubation_time]

    # Check if exact match exists
    if not intubation_filtered.empty and not extubation_filtered.empty:
        row_data = intubation_filtered.merge(extubation_filtered, on=['subject_id', 'hadm_id'], how='outer').iloc[0].to_dict()
    else:
        # Handle intubation data
        if not intubation_filtered.empty:
            intubation_data = intubation_filtered.iloc[0]
            for col in ['subject_id', 'hadm_id', 'int_stayid', 'admittime', 'intubationtime', 'int_itemid', 'int_weight', 'dischtime', 'deathtime']:
                row_data[col] = intubation_data[col]
            for col in ['ext_stayid', 'extubationtime', 'ext_itemid', 'ext_weight', 'extubationcause']:
                row_data[col] = None

        # Handle extubation data
        if not extubation_filtered.empty:
            extubation_data = extubation_filtered.iloc[0]
            for col in ['subject_id', 'hadm_id', 'ext_stayid', 'extubationtime', 'ext_itemid', 'ext_weight', 'extubationcause']:
                row_data[col] = extubation_data[col]
            for col in ['int_stayid', 'admittime', 'intubationtime', 'int_itemid', 'int_weight']:
                row_data.setdefault(col, None)  # Set as None if not already set by intubation data

    return row_data


## DATAFRAME 단위로 적용되는 함수들
## 메인 코드 ##
def process_intubation_data_grouped(df):
    """
    subject_id와 hadm_id 별로 그룹화된 intubation/extubation 데이터를 처리합니다.

    Args:
    df (pd.DataFrame): 데이터가 포함된 DataFrame입니다.

    Returns:
    pd.DataFrame: 처리된 DataFrame입니다.
    """

    modified_df = df.copy()  # 원본 DataFrame의 복사본 생성
    all_log_messages = []

    # subject_id와 hadm_id로 그룹화

    for (subject_id, hadm_id), group in df.groupby(['subject_id', 'hadm_id']):
        # 해당 그룹(hadm_id)에 intubation 이벤트가 하나만 있는 경우
        if group['intubationtime'].count() == 1:
            modified_df, log_messages = handle_single_event_cases(group, modified_df)
            all_log_messages.extend(log_messages)

        # 해당 그룹(hadm_id) 안에 여러개의 intubation 이벤트가 있는 경우
        else:
            pair_logs = []
            # 1. intubation/extubation 페어링
            pair_results = {}  # 해당 그룹의 삽관/발관 페어링 결과 저장소

            unique_intubations = list(group['intubationtime'].dropna().unique())
            unique_extubations = list(group['extubationtime'].dropna().unique())

            pairs = find_pairs(unique_intubations, unique_extubations)
            pair_results[(subject_id, hadm_id)] = pairs  # 페어링 결과 저장

            for pair in pairs:
                pair_logs.append(f"sub_id:{subject_id}, hadm_id:{hadm_id}: intubation/extubation pair: {pair}")
            
            # 2. 페어링된 값을 DataFrame에 적용 (작업중)
            extracted_data = []  # Extracted data from each pair
            for int_time, ext_time in pairs:
                data = extract_row_data(group, int_time, ext_time)
                extracted_data.append(data)


    return modified_df, all_log_messages, pair_logs, pair_results, extracted_data  # log_messages


### TEST CODE (아래)

In [129]:
test_subs = [10098215, 10004401, 10007818, 13033327, \
             1513422667, 13515178, 18344437, 19997367, \
                13696494, 12495515, 10233597]

intubation_extubation_test = intubation_extubation[intubation_extubation.subject_id.isin(test_subs)]
intubation_extubation_test.subject_id.unique()

array([10004401, 10007818, 10098215, 10233597, 12495515, 13033327,
       13515178, 13696494, 18344437, 19997367])

In [122]:
intubation_extubation_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 2 to 10924
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subject_id       87 non-null     int64         
 1   hadm_id          87 non-null     int64         
 2   int_stayid       87 non-null     int64         
 3   admittime        87 non-null     object        
 4   intubationtime   87 non-null     datetime64[ns]
 5   int_itemid       87 non-null     int64         
 6   int_weight       87 non-null     float64       
 7   ext_stayid       85 non-null     float64       
 8   extubationtime   85 non-null     datetime64[ns]
 9   ext_itemid       85 non-null     float64       
 10  ext_weight       85 non-null     float64       
 11  extubationcause  85 non-null     object        
 12  dischtime        87 non-null     object        
 13  deathtime        11 non-null     object        
dtypes: datetime64[ns](2), float64(4), int64(4), ob

In [123]:
intubation_extubation_test

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,NaT,,,,2144-04-13 17:31:00,
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,
6,10007818,22987108,32359580,2146-06-10 16:37:00,2146-06-22 12:49:00,224385,86.2,,NaT,,,,2146-07-12 00:00:00,2146-07-12 20:50:00
69,10098215,20652197,37703075,2118-08-14 20:31:00,2118-09-02 07:45:00,224385,66.0,37703075.0,2118-09-05 11:20:00,227194.0,66.0,Planned Extubation,2118-09-18 15:15:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10920,19997367,20617667,35616526,2126-04-20 07:15:00,2126-04-26 20:14:00,224385,59.0,35616526.0,2126-05-02 16:30:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,
10921,19997367,20617667,35616526,2126-04-20 07:15:00,2126-04-26 20:14:00,224385,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,
10922,19997367,20617667,35616526,2126-04-20 07:15:00,2126-05-05 18:20:00,224385,59.0,35616526.0,2126-04-22 08:23:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,
10923,19997367,20617667,35616526,2126-04-20 07:15:00,2126-05-05 18:20:00,224385,59.0,35616526.0,2126-05-02 16:30:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,


In [131]:
processed_df_grouped, all_log_messages, pair_logs, pair_results, extracted_data = process_intubation_data_grouped(intubation_extubation_test)

In [125]:
extracted_data

[{'subject_id': 19997367,
  'hadm_id': 20617667,
  'int_stayid_x': 35616526,
  'admittime_x': '2126-04-20 07:15:00',
  'intubationtime_x': Timestamp('2126-04-26 20:14:00'),
  'int_itemid_x': 224385,
  'int_weight_x': 59.0,
  'ext_stayid_x': 35616526.0,
  'extubationtime_x': Timestamp('2126-04-22 08:23:00'),
  'ext_itemid_x': 227194.0,
  'ext_weight_x': 59.0,
  'extubationcause_x': 'Planned Extubation',
  'dischtime_x': '2126-05-19 14:15:00',
  'deathtime_x': nan,
  'int_stayid_y': 35616526,
  'admittime_y': '2126-04-20 07:15:00',
  'intubationtime_y': Timestamp('2126-04-26 20:14:00'),
  'int_itemid_y': 224385,
  'int_weight_y': 59.0,
  'ext_stayid_y': 35616526.0,
  'extubationtime_y': Timestamp('2126-05-02 16:30:00'),
  'ext_itemid_y': 227194.0,
  'ext_weight_y': 59.0,
  'extubationcause_y': 'Planned Extubation',
  'dischtime_y': '2126-05-19 14:15:00',
  'deathtime_y': nan},
 {'subject_id': 19997367,
  'hadm_id': 20617667,
  'int_stayid_x': 35616526,
  'admittime_x': '2126-04-20 07:15:

In [110]:
len(pair_logs)

27

In [98]:
pair_results.items()

dict_items([((19997367, 20617667), [(Timestamp('2126-04-26 20:14:00'), Timestamp('2126-05-02 16:30:00')), (Timestamp('2126-05-05 18:20:00'), Timestamp('2126-05-08 17:29:00'))])])

In [103]:
intubation_extubation_test[(intubation_extubation_test.subject_id == 10098215) & (intubation_extubation_test.hadm_id == 22394571)]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
71,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,27 days 21:31:00,2118-07-16 12:15:00,
72,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,29 days 16:53:00,2118-07-16 12:15:00,
73,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),42 days 08:28:00,2118-07-16 12:15:00,
74,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,48 days 17:43:00,2118-07-16 12:15:00,
75,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,2 days 19:43:00,2118-07-16 12:15:00,
76,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,4 days 15:05:00,2118-07-16 12:15:00,
77,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),17 days 06:40:00,2118-07-16 12:15:00,
78,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,23 days 15:55:00,2118-07-16 12:15:00,
79,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-07 01:40:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,-1 days +14:53:00,2118-07-16 12:15:00,
80,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-07 01:40:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,1 days 10:15:00,2118-07-16 12:15:00,


In [92]:
pair_logs

["sub_id:10004401, hadm_id:29988601: intubation/extubation pair: (Timestamp('2144-01-27 19:00:00'), Timestamp('2144-01-30 12:30:00'))",
 "sub_id:10004401, hadm_id:29988601: intubation/extubation pair: (Timestamp('2144-01-30 13:00:00'), None)",
 "sub_id:10098215, hadm_id:22394571: intubation/extubation pair: (Timestamp('2118-05-09 19:02:00'), None)",
 "sub_id:10098215, hadm_id:22394571: intubation/extubation pair: (Timestamp('2118-06-03 20:50:00'), Timestamp('2118-06-06 16:33:00'))",
 "sub_id:10098215, hadm_id:22394571: intubation/extubation pair: (Timestamp('2118-06-07 01:40:00'), Timestamp('2118-06-08 11:55:00'))",
 "sub_id:10098215, hadm_id:22394571: intubation/extubation pair: (Timestamp('2118-06-18 01:48:00'), Timestamp('2118-06-21 03:30:00'))",
 "sub_id:10098215, hadm_id:22394571: intubation/extubation pair: (Timestamp('2118-06-21 08:28:00'), Timestamp('2118-06-27 12:45:00'))",
 "sub_id:10233597, hadm_id:27468267: intubation/extubation pair: (Timestamp('2187-12-14 20:56:00'), Time

In [126]:
# 전체 데이터로 확인
processed_df_full, pair_logs2, pair_results2, extracted_data2 = process_intubation_data_grouped(intubation_extubation)
pair_logs2

["sub_id:10004401, hadm_id:29988601: intubation/extubation pair: (Timestamp('2144-01-27 19:00:00'), Timestamp('2144-01-30 12:30:00'))",
 "sub_id:10004401, hadm_id:29988601: intubation/extubation pair: (Timestamp('2144-01-30 13:00:00'), None)",
 "sub_id:10020740, hadm_id:23831430: intubation/extubation pair: (Timestamp('2150-03-27 07:44:00'), Timestamp('2150-03-27 16:33:00'))",
 "sub_id:10020740, hadm_id:23831430: intubation/extubation pair: (Timestamp('2150-03-30 08:20:00'), Timestamp('2150-04-02 01:09:00'))",
 "sub_id:10024982, hadm_id:25154057: intubation/extubation pair: (Timestamp('2203-10-02 11:00:00'), Timestamp('2203-10-09 14:50:00'))",
 "sub_id:10027602, hadm_id:28166872: intubation/extubation pair: (Timestamp('2201-11-07 15:00:00'), Timestamp('2201-11-09 12:45:00'))",
 "sub_id:10032381, hadm_id:20176432: intubation/extubation pair: (Timestamp('2115-07-06 02:15:00'), Timestamp('2115-07-15 11:00:00'))",
 "sub_id:10032381, hadm_id:20176432: intubation/extubation pair: (Timestamp(

In [127]:
extracted_data2

[{'subject_id': 19997367,
  'hadm_id': 20617667,
  'int_stayid_x': 35616526,
  'admittime_x': '2126-04-20 07:15:00',
  'intubationtime_x': Timestamp('2126-04-26 20:14:00'),
  'int_itemid_x': 224385,
  'int_weight_x': 59.0,
  'ext_stayid_x': 35616526.0,
  'extubationtime_x': Timestamp('2126-04-22 08:23:00'),
  'ext_itemid_x': 227194.0,
  'ext_weight_x': 59.0,
  'extubationcause_x': 'Planned Extubation',
  'dischtime_x': '2126-05-19 14:15:00',
  'deathtime_x': nan,
  'int_stayid_y': 35616526,
  'admittime_y': '2126-04-20 07:15:00',
  'intubationtime_y': Timestamp('2126-04-26 20:14:00'),
  'int_itemid_y': 224385,
  'int_weight_y': 59.0,
  'ext_stayid_y': 35616526.0,
  'extubationtime_y': Timestamp('2126-05-02 16:30:00'),
  'ext_itemid_y': 227194.0,
  'ext_weight_y': 59.0,
  'extubationcause_y': 'Planned Extubation',
  'dischtime_y': '2126-05-19 14:15:00',
  'deathtime_y': nan},
 {'subject_id': 19997367,
  'hadm_id': 20617667,
  'int_stayid_x': 35616526,
  'admittime_x': '2126-04-20 07:15:

time pairing test

In [95]:
pair_results2

{(19997367,
  20617667): [(Timestamp('2126-04-26 20:14:00'),
   Timestamp('2126-05-02 16:30:00')), (Timestamp('2126-05-05 18:20:00'),
   Timestamp('2126-05-08 17:29:00'))]}

In [90]:
pair_logs2

["sub_id:10004401, hadm_id:29988601: intubation/extubation pair: (Timestamp('2144-01-27 19:00:00'), Timestamp('2144-01-30 12:30:00'))",
 "sub_id:10004401, hadm_id:29988601: intubation/extubation pair: (Timestamp('2144-01-30 13:00:00'), None)",
 "sub_id:10020740, hadm_id:23831430: intubation/extubation pair: (Timestamp('2150-03-27 07:44:00'), Timestamp('2150-03-27 16:33:00'))",
 "sub_id:10020740, hadm_id:23831430: intubation/extubation pair: (Timestamp('2150-03-30 08:20:00'), Timestamp('2150-04-02 01:09:00'))",
 "sub_id:10024982, hadm_id:25154057: intubation/extubation pair: (Timestamp('2203-10-02 11:00:00'), Timestamp('2203-10-09 14:50:00'))",
 "sub_id:10027602, hadm_id:28166872: intubation/extubation pair: (Timestamp('2201-11-07 15:00:00'), Timestamp('2201-11-09 12:45:00'))",
 "sub_id:10032381, hadm_id:20176432: intubation/extubation pair: (Timestamp('2115-07-06 02:15:00'), Timestamp('2115-07-15 11:00:00'))",
 "sub_id:10032381, hadm_id:20176432: intubation/extubation pair: (Timestamp(

Single value missing data

In [132]:
intubation_extubation_test[intubation_extubation_test.subject_id.isin([10004401, 10007818])]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,NaT,,,,2144-04-13 17:31:00,
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,
6,10007818,22987108,32359580,2146-06-10 16:37:00,2146-06-22 12:49:00,224385,86.2,,NaT,,,,2146-07-12 00:00:00,2146-07-12 20:50:00


In [133]:
processed_df_grouped[processed_df_grouped.subject_id.isin([10004401, 10007818])]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,2144-04-13 17:31:00,,,,2144-04-13 17:31:00,
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,
6,10007818,22987108,32359580,2146-06-10 16:37:00,2146-06-22 12:49:00,224385,86.2,,2146-07-12 20:50:00,,,,2146-07-12 00:00:00,2146-07-12 20:50:00


------

In [96]:
test_subject = intubation_extubation[intubation_extubation.subject_id == 10098215]   # extract single subject
test_subject_sorted = test_subject.sort_values(by=['intubationtime'], ascending=[True])   # sort data in ascending order of intubationtime
test_subject_sorted

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
91,10098215,29752040,31800901,2117-10-25 20:40:00,2117-10-26 10:53:00,224385,56.5,31800901.0,2117-10-29 09:25:00,227194.0,56.5,Planned Extubation,2 days 22:32:00,2117-11-03 16:30:00,
70,10098215,21701015,30598744,2118-01-10 03:42:00,2118-01-10 13:09:00,224385,57.0,30598744.0,2118-01-11 09:55:00,227194.0,57.0,Planned Extubation,0 days 20:46:00,2118-02-13 19:00:00,
71,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,27 days 21:31:00,2118-07-16 12:15:00,
72,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,29 days 16:53:00,2118-07-16 12:15:00,
73,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),42 days 08:28:00,2118-07-16 12:15:00,
74,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,48 days 17:43:00,2118-07-16 12:15:00,
75,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,2 days 19:43:00,2118-07-16 12:15:00,
76,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,4 days 15:05:00,2118-07-16 12:15:00,
77,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),17 days 06:40:00,2118-07-16 12:15:00,
78,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,23 days 15:55:00,2118-07-16 12:15:00,


#### 1) hadm_id 별로 몇번의 intubation event가 있었는지 카운트

In [100]:
# Count the number of intubationtime events for each hadm_id
int_ext_counts = test_subject_sorted.groupby('hadm_id')[['intubationtime', 'extubationtime']].count()

# Display the counts
int_ext_counts.reset_index()


Unnamed: 0,hadm_id,intubationtime,extubationtime
0,20652197,1,1
1,21701015,1,1
2,22394571,20,20
3,29752040,1,1


subject-level 관련 함수

In [135]:
subject_id = 10098215
test_subject_unique_data = intubation_extubation[intubation_extubation.subject_id == subject_id]   # extract single subject
test_results = process_intubation_data_grouped(test_subject_unique_data)


고유한 intubation 이벤트 수: 5
고유한 extubation 이벤트 수: 4
extubation 결측치 수: 0
--고유한 Intubation 이벤트--
<DatetimeArray>
['2118-05-09 19:02:00', '2118-06-03 20:50:00', '2118-06-07 01:40:00',
 '2118-06-18 01:48:00', '2118-06-21 08:28:00']
Length: 5, dtype: datetime64[ns]
--고유한 Extubation 이벤트--
<DatetimeArray>
['2118-06-06 16:33:00', '2118-06-08 11:55:00', '2118-06-21 03:30:00',
 '2118-06-27 12:45:00']
Length: 4, dtype: datetime64[ns]
list print test:
2118-05-09 19:02:00


In [132]:
test_subject_unique_data[test_subject_unique_data.hadm_id == 22394571]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
71,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,27 days 21:31:00,2118-07-16 12:15:00,
72,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,29 days 16:53:00,2118-07-16 12:15:00,
73,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),42 days 08:28:00,2118-07-16 12:15:00,
74,10098215,22394571,32817342,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,48 days 17:43:00,2118-07-16 12:15:00,
75,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,2 days 19:43:00,2118-07-16 12:15:00,
76,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,4 days 15:05:00,2118-07-16 12:15:00,
77,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),17 days 06:40:00,2118-07-16 12:15:00,
78,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,23 days 15:55:00,2118-07-16 12:15:00,
79,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-07 01:40:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,-1 days +14:53:00,2118-07-16 12:15:00,
80,10098215,22394571,36018186,2118-05-03 23:07:00,2118-06-07 01:40:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,1 days 10:15:00,2118-07-16 12:15:00,


In [None]:

# DataFrame 처리
processed_df_grouped, logs_grouped = process_intubation_data_grouped(test_subject_sorted)

# 처리된 데이터와 로그 메시지 출력
processed_df_grouped.head(), logs_grouped


In [108]:
## 결측치 채우기 테스트
subject_id = 10004401  # extubation 결측치 존재

test_subject_missing_data = intubation_extubation[intubation_extubation.subject_id == subject_id]   # extract single subject
test_subject_missing_data_sorted = test_subject_missing_data.sort_values(by=['intubationtime'], ascending=[True])   # sort data in ascending order of intubationtime
test_subject_missing_data_sorted





Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2 days 17:30:00,2144-02-06 11:45:00,
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,-1 days +23:30:00,2144-02-06 11:45:00,
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,NaT,,,,NaT,2144-04-13 17:31:00,


In [110]:
# DataFrame 처리
processed_sub10004401, logs_grouped = process_intubation_data_grouped(test_subject_missing_data_sorted)

# 처리된 데이터와 로그 메시지 출력
processed_sub10004401

hadm_id: 27939719 처리 중...
hadm_id: 29988601 처리 중...
하나 이상의 대체 이벤트가 있음. 계속 진행...


Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2 days 17:30:00,2144-02-06 11:45:00,
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,-1 days +23:30:00,2144-02-06 11:45:00,
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,2144-04-13 17:31:00,,,,NaT,2144-04-13 17:31:00,


In [111]:
## 결측치 채우기 테스트
subject_id = 10007818  # extubation 결측치 존재

test_subject_missing_data = intubation_extubation[intubation_extubation.subject_id == subject_id]   # extract single subject
test_subject_missing_data_sorted = test_subject_missing_data.sort_values(by=['intubationtime'], ascending=[True])   # sort data in ascending order of intubationtime
test_subject_missing_data_sorted





Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
6,10007818,22987108,32359580,2146-06-10 16:37:00,2146-06-22 12:49:00,224385,86.2,,NaT,,,,NaT,2146-07-12 00:00:00,2146-07-12 20:50:00


In [112]:
# DataFrame 처리
processed_sub10004401, logs_grouped = process_intubation_data_grouped(test_subject_missing_data_sorted)

# 처리된 데이터와 로그 메시지 출력
processed_sub10004401

hadm_id: 22987108 처리 중...


Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
6,10007818,22987108,32359580,2146-06-10 16:37:00,2146-06-22 12:49:00,224385,86.2,,2146-07-12 20:50:00,,,,NaT,2146-07-12 00:00:00,2146-07-12 20:50:00


In [None]:
# 정렬알고리즘 테스트 예시
# start_times = [list of start times]
# end_times = [list of end times]
# sorted_pairs = find_pairs(start_times, end_times)

In [89]:
# Group the data by hadm_id and count duplicate values for intubationtime and extubationtime
intubation_duplicates = test_subject_sorted.groupby('hadm_id')['intubationtime'].value_counts()
extubation_duplicates = test_subject_sorted.groupby('hadm_id')['extubationtime'].value_counts()

# Prepare a DataFrame to display the results
duplicate_counts = pd.DataFrame({
    'Intubation Duplicates': intubation_duplicates,
    'Extubation Duplicates': extubation_duplicates
}).fillna(0).astype(int)

# Display the DataFrame
duplicate_counts.reset_index()

Unnamed: 0,hadm_id,level_1,Intubation Duplicates,Extubation Duplicates
0,20652197,2118-09-02 07:45:00,1,0
1,20652197,2118-09-05 11:20:00,0,1
2,21701015,2118-01-10 13:09:00,1,0
3,21701015,2118-01-11 09:55:00,0,1
4,22394571,2118-05-09 19:02:00,4,0
5,22394571,2118-06-03 20:50:00,4,0
6,22394571,2118-06-06 16:33:00,0,5
7,22394571,2118-06-07 01:40:00,4,0
8,22394571,2118-06-08 11:55:00,0,5
9,22394571,2118-06-18 01:48:00,4,0


In [80]:
test_subject.shape

(12, 14)

In [77]:
test_subject2 = intubation_extubation_stay[intubation_extubation_stay.subject_id == 13033327]
test_subject2

Unnamed: 0,subject_id,hadm_id,stay_id,admittime,intubationtime,int_itemid,int_weight,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
3032,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-05 17:41:00,224385,71.0,2146-01-11 15:42:00,227194.0,71.0,Planned Extubation,5 days 22:01:00,2146-02-15 17:10:00,
3033,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-05 17:41:00,224385,71.0,2146-01-16 16:00:00,227194.0,71.0,Planned Extubation,10 days 22:19:00,2146-02-15 17:10:00,
3034,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-05 17:41:00,224385,71.0,2146-01-18 13:05:00,227194.0,71.0,Planned Extubation,12 days 19:24:00,2146-02-15 17:10:00,
3035,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-05 17:41:00,224385,71.0,2146-01-23 09:25:00,227194.0,71.0,Planned Extubation,17 days 15:44:00,2146-02-15 17:10:00,
3036,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-05 17:41:00,224385,71.0,2146-01-29 10:00:00,225477.0,71.0,Unplanned Extuabtion (non-patient initiated),23 days 16:19:00,2146-02-15 17:10:00,
3037,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-20 15:12:00,224385,71.0,2146-01-11 15:42:00,227194.0,71.0,Planned Extubation,-9 days +00:30:00,2146-02-15 17:10:00,
3038,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-20 15:12:00,224385,71.0,2146-01-16 16:00:00,227194.0,71.0,Planned Extubation,-4 days +00:48:00,2146-02-15 17:10:00,
3039,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-20 15:12:00,224385,71.0,2146-01-18 13:05:00,227194.0,71.0,Planned Extubation,-3 days +21:53:00,2146-02-15 17:10:00,
3040,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-20 15:12:00,224385,71.0,2146-01-23 09:25:00,227194.0,71.0,Planned Extubation,2 days 18:13:00,2146-02-15 17:10:00,
3041,13033327,21093920,39867250,2146-01-05 01:07:00,2146-01-20 15:12:00,224385,71.0,2146-01-29 10:00:00,225477.0,71.0,Unplanned Extuabtion (non-patient initiated),8 days 18:48:00,2146-02-15 17:10:00,


In [None]:
su

In [44]:
int_dup_count = intubation_extubation.groupby(['subject_id', 'hadm_id', 'int_stayid'])['intubationtime'].nunique().reset_index()
int_dup_count = int_dup_count.rename(columns={'intubationtime': 'unique_intubation_count'})

int_dup_count.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7330 entries, 0 to 7329
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   subject_id               7330 non-null   int64  
 1   hadm_id                  7330 non-null   int64  
 2   int_stayid               7330 non-null   float64
 3   unique_intubation_count  7330 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 229.2 KB


In [58]:
int_dup_count[int_dup_count.unique_intubation_count > 1]

Unnamed: 0,subject_id,hadm_id,int_stayid,unique_intubation_count
3,10004401,29988601,32773003.0,2
22,10032381,20176432,34622731.0,2
29,10039708,28258130,33281088.0,2
51,10090454,29525590,38669202.0,2
56,10098215,22394571,32301420.0,2
...,...,...,...,...
7263,19913577,20355379,37544707.0,3
7273,19928728,21394753,32643307.0,2
7287,19949739,25237531,32028235.0,2
7294,19960105,27292691,31512322.0,2


In [34]:
intubation_extubation[intubation_extubation.hadm_id == 22394571]

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,int_ext_time,dischtime,deathtime
71,10098215,22394571,32817342.0,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,27 days 21:31:00,2118-07-16 12:15:00,
72,10098215,22394571,32817342.0,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,29 days 16:53:00,2118-07-16 12:15:00,
73,10098215,22394571,32817342.0,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),42 days 08:28:00,2118-07-16 12:15:00,
74,10098215,22394571,32817342.0,2118-05-03 23:07:00,2118-05-09 19:02:00,224385,60.0,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,48 days 17:43:00,2118-07-16 12:15:00,
75,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,2 days 19:43:00,2118-07-16 12:15:00,
76,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,4 days 15:05:00,2118-07-16 12:15:00,
77,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-21 03:30:00,225468.0,48.7,Unplanned Extuabtion (patient-initiated),17 days 06:40:00,2118-07-16 12:15:00,
78,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-03 20:50:00,224385,55.4,32301420.0,2118-06-27 12:45:00,227194.0,48.7,Planned Extubation,23 days 15:55:00,2118-07-16 12:15:00,
79,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-07 01:40:00,224385,55.4,36018186.0,2118-06-06 16:33:00,227194.0,55.4,Planned Extubation,-1 days +14:53:00,2118-07-16 12:15:00,
80,10098215,22394571,36018186.0,2118-05-03 23:07:00,2118-06-07 01:40:00,224385,55.4,36018186.0,2118-06-08 11:55:00,227194.0,55.4,Planned Extubation,1 days 10:15:00,2118-07-16 12:15:00,


In [72]:
def calculate_mismatches(df):
    """
    이 함수는 한번의 입원 중 복수의 삽관 및 발관 이벤트가 있는 경우를 계산합니다.
    데이터를 subject_id, hadm_id, stay_id로 그룹화하고 intubationtime 및 extubationtime에 대한 고유 값의 수를 계산한 후,
    이러한 시간에 대해 여러 개의 고유 값이 있는 그룹을 식별합니다.
    """
    # subject_id, hadm_id, stay_id로 그룹화
    grouped = df.groupby(['subject_id', 'hadm_id', 'stay_id'])

    # intubationtime 및 extubationtime에 대한 고유 값 수 계산
    unique_counts = grouped.agg({'intubationtime': pd.Series.nunique, 'extubationtime': pd.Series.nunique})

    # intubationtime 및 extubationtime에 여러 개의 고유 값이 있는 그룹 필터링
    potential_mismatches = unique_counts[(unique_counts['intubationtime'] > 1) | (unique_counts['extubationtime'] > 1)]

    return potential_mismatches.reset_index()

# 데이터프레임에 함수 적용
mismatch_stats = calculate_mismatches(intubation_extubation_stay)
mismatch_stats.head()


Unnamed: 0,subject_id,hadm_id,stay_id,intubationtime,extubationtime
0,10004401,29988601,32773003,2,1
1,10024982,25154057,37919901,1,2
2,10027602,28166872,32391858,1,2
3,10032381,20176432,34622731,2,1
4,10039708,28258130,33281088,2,2


In [73]:
mismatch_stats[mismatch_stats['subject_id'] == 13033327]

Unnamed: 0,subject_id,hadm_id,stay_id,intubationtime,extubationtime
400,13033327,21093920,39867250,4,5


In [74]:
mismatch_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1395 entries, 0 to 1394
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   subject_id      1395 non-null   int64
 1   hadm_id         1395 non-null   int64
 2   stay_id         1395 non-null   int64
 3   intubationtime  1395 non-null   int64
 4   extubationtime  1395 non-null   int64
dtypes: int64(5)
memory usage: 54.6 KB


### 2. Extubation 결측치 처리하기

In [24]:
# 결측치 확인
extubation_missing_count = intubation_extubation.extubationtime.isna().sum()
print(f'Extubation 결측치 개수: {extubation_missing_count} / {len(intubation_extubation)} ({round(extubation_missing_count/len(intubation_extubation) * 100, 2)}%)') 

Extubation 결측치 개수: 1541 / 10928 (14.1%)


In [5]:
#데이터를 보면, 사망일과 퇴원일이 같은경우/다른경우가 있고, 사망은 없고 퇴원한 경우가 있음. 사망한 경우 사망하면 기기를 제거하므로 deathtime 을 쓰는 것이 적절할 것임. discharge를 할 경우 관을 유지하나? 보통 발관할 것 같은데..? 그래도 해당 병원에
#이 경우, extubationcause 에 deathtime 으로 적어서 deathtime 을 사용했음을 보여주려고 함.
#사망은 869건이니까 1840-869 = 971건이 퇴원한 경우임. 이 경우는 discharge time을 사용할지? 흠 역시 dischargetime 을 사용하는것이 적절할 것같음, 이 경우에도 extubationcause에 dischargetime 으로 적어야 할 것 같음
#그럼 이경우 코드를 어떻게 짤 수 있냐? 데이터 둘다 있으면 deathtime 에게 우선 순위를 주겠음(사유, 보통 더 정확함)



In [6]:
# intcount가 1인 경우, 다음 로직을 적용하여 채우도록하기
# #deathtime과 dischtime 둘 다 있는 경우, deathtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "deathtime" 문자열을 입력합니다.
# #deathtime이 있는 경우 (이전의 로직과 겹치지 않는다는 가정하에) 역시 deathtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "deathtime" 문자열을 입력합니다.
# #deathtime이 없고 dischtime만 있는 경우, dischtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "dischargetime" 문자열을 입력합니다.
# # deathtime과 dischtime 둘 다 있는 경우
# mask_both = (~check_extubation2['deathtime'].isna()) & (~check_extubation2['dischtime'].isna())
# check_extubation2.loc[mask_both, 'extubationtime'] = check_extubation2.loc[mask_both, 'deathtime']
# check_extubation2.loc[mask_both, 'extubationcause'] = 'deathtime'

# # deathtime만 있는 경우
# mask_death = (~check_extubation2['deathtime'].isna()) & (check_extubation2['dischtime'].isna())
# check_extubation2.loc[mask_death, 'extubationtime'] = check_extubation2.loc[mask_death, 'deathtime']
# check_extubation2.loc[mask_death, 'extubationcause'] = 'deathtime'

# # dischtime만 있는 경우
# mask_discharge = (check_extubation2['deathtime'].isna()) & (~check_extubation2['dischtime'].isna())
# check_extubation2.loc[mask_discharge, 'extubationtime'] = check_extubation2.loc[mask_discharge, 'dischtime']
# check_extubation2.loc[mask_discharge, 'extubationcause'] = 'dischargetime'

In [7]:
# #deathtime과 dischtime 둘 다 있는 경우, deathtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "deathtime" 문자열을 입력합니다.
# #deathtime이 있는 경우 (이전의 로직과 겹치지 않는다는 가정하에) 역시 deathtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "deathtime" 문자열을 입력합니다.
# #deathtime이 없고 dischtime만 있는 경우, dischtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "dischargetime" 문자열을 입력합니다.
# # deathtime과 dischtime 둘 다 있는 경우
# mask_both = (~check_extubation2['deathtime'].isna()) & (~check_extubation2['dischtime'].isna())
# check_extubation2.loc[mask_both, 'extubationtime'] = check_extubation2.loc[mask_both, 'deathtime']
# check_extubation2.loc[mask_both, 'extubationcause'] = 'deathtime'

# # deathtime만 있는 경우
# mask_death = (~check_extubation2['deathtime'].isna()) & (check_extubation2['dischtime'].isna())
# check_extubation2.loc[mask_death, 'extubationtime'] = check_extubation2.loc[mask_death, 'deathtime']
# check_extubation2.loc[mask_death, 'extubationcause'] = 'deathtime'

# # dischtime만 있는 경우
# mask_discharge = (check_extubation2['deathtime'].isna()) & (~check_extubation2['dischtime'].isna())
# check_extubation2.loc[mask_discharge, 'extubationtime'] = check_extubation2.loc[mask_discharge, 'dischtime']
# check_extubation2.loc[mask_discharge, 'extubationcause'] = 'dischargetime'


In [8]:
# #주의! extubationtime이 이미 데이터가 있는 경우 replace하면 안됨!
# #deathtime과 dischtime 둘 다 있는 경우, deathtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "deathtime" 문자열을 입력합니다.
# #deathtime이 있는 경우 (이전의 로직과 겹치지 않는다는 가정하에) 역시 deathtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "deathtime" 문자열을 입력합니다.
# #deathtime이 없고 dischtime만 있는 경우, dischtime을 사용하여 extubationtime을 업데이트하고 extubationcause에 "dischargetime" 문자열을 입력합니다.
# # deathtime과 dischtime 둘 다 있는 경우이면서 extubationtime이 NaN인 경우
# mask_both = (~intubation_extubation2['deathtime'].isna()) & (~intubation_extubation2['dischtime'].isna()) & (intubation_extubation2['extubationtime'].isna())
# intubation_extubation2.loc[mask_both, 'extubationtime'] = intubation_extubation2.loc[mask_both, 'deathtime']
# intubation_extubation2.loc[mask_both, 'extubationcause'] = 'deathtime'
# #
# # deathtime만 있는 경우이면서 extubationtime이 NaN인 경우
# mask_death = (~intubation_extubation2['deathtime'].isna()) & (intubation_extubation2['dischtime'].isna()) & (intubation_extubation2['extubationtime'].isna())
# intubation_extubation2.loc[mask_death, 'extubationtime'] = intubation_extubation2.loc[mask_death, 'deathtime']
# intubation_extubation2.loc[mask_death, 'extubationcause'] = 'deathtime'

# # dischtime만 있는 경우이면서 extubationtime이 NaN인 경우
# mask_discharge = (intubation_extubation2['deathtime'].isna()) & (~intubation_extubation2['dischtime'].isna()) & (intubation_extubation2['extubationtime'].isna())
# intubation_extubation2.loc[mask_discharge, 'extubationtime'] = intubation_extubation2.loc[mask_discharge, 'dischtime']
# intubation_extubation2.loc[mask_discharge, 'extubationcause'] = 'dischargetime'


In [10]:

# #여기서 intcount는 해당 입원에 총 몇번의 intubation을 했는지를 의미한다.
# #그러므로 같은 hadm_id에서 intcount와 seq가 같다면, dichtime 또는 deathtime 으로 extubationtime을 대체할 수 있다.
# #그외는 마지막 seq에 도달하기 전이므로, 다음 seq의 intubationtime 을 이전 seq의 extubationtime으로 대체한다.

# df = intubation_extubation3


# def replace_extubationtime_non(df):
#     # intcount와 intseq가 같고 extubationtime이 NaT인 행만 선택
#     mask_last_seq = (df['intcount'] == df['intseq']) & df['extubationtime'].isna()
    
#     # deathtime 또는 dischtime으로 extubationtime의 NaT 값을 대체
#     df.loc[mask_last_seq, 'extubationtime'] = df.loc[mask_last_seq, 'dischtime'].fillna(df.loc[mask_last_seq, 'deathtime'])
#     df.loc[mask_last_seq, 'extubationcause'] = 'dischtime'
    
#     # 그 외의 경우: 다음 seq의 intubationtime을 이전 seq의 extubationtime으로 대체
#     for idx, row in df.iterrows():
#         if row['intseq'] < row['intcount'] and pd.isna(row['extubationtime']):
#             next_row = df[(df['hadm_id'] == row['hadm_id']) & (df['intseq'] == row['intseq'] + 1)]
#             if not next_row.empty:
#                 df.at[idx, 'extubationtime'] = next_row['intubationtime'].values[0]
#                 df.at[idx, 'extubationcause'] = 'next_intubationtime'
    
#     return df

# # 데이터프레임에 함수 적용
# df = replace_extubationtime_non(df)


In [11]:
#만약 intubationtoextubationtime 이 음수이면서, intcount과 intseq가 같다면 dischtime 으로 extubationtime을 대체할 수 있다.
# intubationtoextubationtime이 음수인 경우 및 intcount와 intseq가 같은 경우, extubationtime을 dischtime으로 대체