In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 경로 설정 및 코호트 불러오기 (cohort_ver48)

In [5]:
import pandas as pd
import numpy as np
import os

# -----------------------------------------------------------
# 1. 경로 설정 (Path Configuration)
# -----------------------------------------------------------
# 기본 경로
base_path = '/content/drive/MyDrive/Colab Notebooks/2025-2 데이터애널리틱스'

# ED 데이터셋 폴더 경로 (mimid -> mimic으로 수정했습니다. 실제 폴더명 확인 필요)
ed_base_path = os.path.join(base_path, 'mimic-iv-ed-2.2')
edstays_csv_path = os.path.join(ed_base_path, 'edstays.csv')

# 코호트 파일 경로 (base_path 내에 있다고 가정)
cohort_csv_path = os.path.join(base_path, 'cohort_ver48_remove_trash_col.csv')

# -----------------------------------------------------------
# 2. 데이터 로드 (Data Loading)
# -----------------------------------------------------------
try:
    print(f"Loading ED Stays from: {edstays_csv_path}")
    df_edstays = pd.read_csv(edstays_csv_path, dtype={'subject_id': int, 'hadm_id': 'Int64', 'stay_id': int})
    print(f" -> edstays shape: {df_edstays.shape}")

    print(f"Loading Cohort from: {cohort_csv_path}")
    # [수정 완료] os.path.join 함수가 아닌, 완성된 문자열 경로(cohort_csv_path)를 넣었습니다.
    df_cohort = pd.read_csv(cohort_csv_path)
    print(f" -> cohort shape: {df_cohort.shape}")

except FileNotFoundError as e:
    print(f"\n[오류] 파일을 찾을 수 없습니다. 경로를 확인해주세요.\nError: {e}")
except Exception as e:
    print(f"\n[오류] 데이터 로드 중 문제가 발생했습니다.\nError: {e}")

# -----------------------------------------------------------
# 3. 날짜 컬럼 변환 (Datetime Conversion)
# -----------------------------------------------------------
if 'df_cohort' in locals():
    time_cols = [
        'ed_intime', 'ed_outtime', 'first_ecg_charttime',
        'first_troponin_charttime', 'first_antithrombotic_charttime',
        'first_stemi_ecg_time', 'first_troponin_positive_charttime',
        'pci_starttime', 'pci_endtime', 'dischtime'
    ]

    print("\n변환 중: 날짜 컬럼 datetime 변환...")
    for col in time_cols:
        if col in df_cohort.columns:
            df_cohort[col] = pd.to_datetime(df_cohort[col], errors='coerce')

    print("날짜 변환 완료.")

Loading ED Stays from: /content/drive/MyDrive/Colab Notebooks/2025-2 데이터애널리틱스/mimic-iv-ed-2.2/edstays.csv
 -> edstays shape: (425087, 9)
Loading Cohort from: /content/drive/MyDrive/Colab Notebooks/2025-2 데이터애널리틱스/cohort_ver48_remove_trash_col.csv
 -> cohort shape: (1930, 36)

변환 중: 날짜 컬럼 datetime 변환...
날짜 변환 완료.


# [1] event log 형태로 테이블 변환

In [7]:
# 2. 정적 데이터(arrival_transport) 인코딩 및 병합
# 만약 코호트에 이미 arrival_transport가 있다면 그대로 사용, 없다면 edstays와 조인
if 'arrival_transport' not in df_cohort.columns and 'df_edstays' in locals():
    df_cohort = df_cohort.merge(df_edstays[['hadm_id', 'arrival_transport']], on='hadm_id', how='left')

def encode_transport(x):
    if pd.isna(x): return 0
    x = str(x).upper()
    if 'WALK IN' in x: return 1
    elif 'AMBULANCE' in x: return 2
    elif 'HELICOPTER' in x: return 3
    else: return 0

# 인코딩 적용
if 'arrival_transport' in df_cohort.columns:
    df_cohort['arrival_transport_code'] = df_cohort['arrival_transport'].apply(encode_transport)

# 3. 정적 코호트 -> 동적 이벤트 로그로 변환 (Melting)
# 주요 타임스탬프를 이벤트로 정의하여 행(Row)을 늘립니다.
event_mapping = {
    'ed_intime': 'ED_ARRIVAL',
    'first_ecg_charttime': 'ECG_TAKEN',
    'first_troponin_charttime': 'TROP_TAKEN',
    'first_antithrombotic_charttime': 'ANTI_TAKEN',
    'first_stemi_ecg_time': 'STEMI_FLAG',
    'pci_starttime': 'PCI_START',
    'pci_endtime': 'PCI_END',
    'dischtime': 'DISCHARGE'
}

events_list = []
for col, event_name in event_mapping.items():
    if col in df_cohort.columns:
        temp = df_cohort[['subject_id', 'hadm_id', col]].dropna()
        temp.columns = ['subject_id', 'hadm_id', 'charttime']
        temp['event_name'] = event_name
        events_list.append(temp)

# 모든 이벤트를 합치고 정렬
df_events = pd.concat(events_list).sort_values(by=['subject_id', 'hadm_id', 'charttime'])

# 원본 코호트의 기준 시간(ed_intime 등)을 다시 병합 (시간 차이 계산용)
df_events = df_events.merge(df_cohort[['hadm_id', 'ed_intime', 'first_stemi_ecg_time', 'first_troponin_positive_charttime']], on='hadm_id', how='left')

df_events

Unnamed: 0,subject_id,hadm_id,charttime,event_name,ed_intime,first_stemi_ecg_time,first_troponin_positive_charttime
0,10000764,27897940,2132-10-14 19:31:00,ED_ARRIVAL,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00
1,10000764,27897940,2132-10-14 19:40:00,ECG_TAKEN,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00
2,10000764,27897940,2132-10-14 19:40:00,STEMI_FLAG,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00
3,10000764,27897940,2132-10-15 07:45:00,TROP_TAKEN,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00
4,10000764,27897940,2132-10-15 20:00:00,ANTI_TAKEN,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00
...,...,...,...,...,...,...,...
11170,19996783,21880161,2188-03-05 15:18:00,ECG_TAKEN,2188-05-09 05:02:00,NaT,2188-05-09 21:56:00
11171,19996783,21880161,2188-05-09 05:02:00,ED_ARRIVAL,2188-05-09 05:02:00,NaT,2188-05-09 21:56:00
11172,19996783,21880161,2188-05-09 21:56:00,TROP_TAKEN,2188-05-09 05:02:00,NaT,2188-05-09 21:56:00
11173,19996783,21880161,2188-05-09 23:00:00,ANTI_TAKEN,2188-05-09 05:02:00,NaT,2188-05-09 21:56:00


# [2] 동적 피처 엔지니어링

In [10]:
# -----------------------------------------------------------
# 4. 동적 피처 엔지니어링 (Dynamic Feature Engineering)
# -----------------------------------------------------------

# 4-1. Time Features
df_events['current_time'] = df_events['charttime']
# time_since_ed (분 단위)
df_events['time_since_ed'] = (df_events['current_time'] - df_events['ed_intime']).dt.total_seconds() / 60
# time_since_last (직전 이벤트로부터 경과 시간)
df_events['prev_event_time'] = df_events.groupby('hadm_id')['charttime'].shift(1)
df_events['time_since_last'] = (df_events['current_time'] - df_events['prev_event_time']).dt.total_seconds() / 60
df_events['time_since_last'] = df_events['time_since_last'].fillna(0) # 첫 이벤트는 0

# time_since_start_min (첫 이벤트로부터 경과 시간)
df_events['first_event_time'] = df_events.groupby('hadm_id')['charttime'].transform('min')
df_events['time_since_start_min'] = (df_events['current_time'] - df_events['first_event_time']).dt.total_seconds() / 60

# is_night (22시 ~ 07시)
df_events['hour'] = df_events['current_time'].dt.hour
df_events['is_night'] = df_events['hour'].apply(lambda h: 1 if (h >= 22 or h < 7) else 0)

# 4-2. Sequence Features
# prefix_len
df_events['prefix_len'] = df_events.groupby('hadm_id').cumcount() + 1

# prefix_events_str (수정됨)
# expanding() 대신 문자열 덧셈과 cumsum()을 사용하여 누적 문자열 생성
# 1. 각 이벤트 이름 뒤에 구분자 '>'를 붙임
# 2. 그룹별로 누적 합(문자열 연결)을 구함
# 3. 마지막에 붙은 불필요한 '>'를 제거
df_events['prefix_events_str'] = df_events.groupby('hadm_id')['event_name'].transform(
    lambda x: (x.astype(str) + '>').cumsum().str[:-1]
)

# 4-3. Count & State Features
# (주의: 제공된 코호트 파일에는 개별 트로포닌 수치가 없고 '총 횟수'만 있으므로, 여기서는 이벤트 발생 여부로 카운트합니다)
df_events['is_ecg'] = (df_events['event_name'] == 'ECG_TAKEN').astype(int)
df_events['cum_ecg_cnt'] = df_events.groupby('hadm_id')['is_ecg'].cumsum()

df_events['is_trop'] = (df_events['event_name'] == 'TROP_TAKEN').astype(int)
df_events['cum_trop_cnt'] = df_events.groupby('hadm_id')['is_trop'].cumsum()

# stemi_flag (확진 시점 이후면 1)
df_events['stemi_flag'] = 0
mask_stemi = (df_events['first_stemi_ecg_time'].notna()) & (df_events['current_time'] >= df_events['first_stemi_ecg_time'])
df_events.loc[mask_stemi, 'stemi_flag'] = 1

# trop_pos_flag (양성 판정 시점 이후면 1)
df_events['trop_pos_flag'] = 0
mask_trop_pos = (df_events['first_troponin_positive_charttime'].notna()) & (df_events['current_time'] >= df_events['first_troponin_positive_charttime'])
df_events.loc[mask_trop_pos, 'trop_pos_flag'] = 1

# pci_status
# 0:안함, 1:진행중, 2:종료
# 간단 로직: PCI_START 이벤트 시점=1, PCI_END 이후=2
df_events['pci_status'] = 0
df_events.loc[df_events['event_name'] == 'PCI_START', 'pci_status'] = 1
df_events.loc[df_events['event_name'] == 'PCI_END', 'pci_status'] = 2
# (실제로는 상태 유지 로직(forward fill)이 필요할 수 있으나, 이벤트 시점 기준으로는 위와 같이 태깅)

# 4-4. Pathway Stage Mapping
stage_map = {
    'ED_ARRIVAL': 0,
    'ECG_TAKEN': 1,
    'TROP_TAKEN': 1,
    'STEMI_FLAG': 2,
    'ANTI_TAKEN': 3,
    'PCI_START': 4,
    'PCI_END': 5,
    'DISCHARGE': 5
}
df_events['pathway_stage'] = df_events['event_name'].map(stage_map).fillna(0)

# 4-5. ID Mapping
event_id_map = {name: i for i, name in enumerate(stage_map.keys())}
df_events['current_event_id'] = df_events['event_name'].map(event_id_map)

# 4-6. Lab Values (last_trop, run_max_trop, trop_trend)
# 주의: 현재 코호트 파일에는 구체적인 Lab Value(검사 수치)가 없습니다.
# 실제 Lab 데이터가 있다면 merge 후 아래와 같이 처리합니다.
# df_events['last_trop'] = df_events.groupby('hadm_id')['trop_value'].ffill().fillna(0)
# df_events['run_max_trop'] = df_events.groupby('hadm_id')['trop_value'].expanding().max().fillna(0)
# df_events['trop_trend'] = df_events['last_trop'] - df_events.groupby('hadm_id')['last_trop'].shift(1).fillna(0)

# 최종 결과 확인
cols_order = ['subject_id', 'hadm_id', 'prefix_len', 'event_name', 'charttime', 'time_since_ed', 'stemi_flag', 'pathway_stage']
print(df_events[cols_order].head(10))

   subject_id   hadm_id  prefix_len  event_name           charttime  \
0    10000764  27897940           1  ED_ARRIVAL 2132-10-14 19:31:00   
1    10000764  27897940           2   ECG_TAKEN 2132-10-14 19:40:00   
2    10000764  27897940           3  STEMI_FLAG 2132-10-14 19:40:00   
3    10000764  27897940           4  TROP_TAKEN 2132-10-15 07:45:00   
4    10000764  27897940           5  ANTI_TAKEN 2132-10-15 20:00:00   
5    10000764  27897940           6   DISCHARGE 2132-10-19 16:30:00   
6    10010058  26359957           1   ECG_TAKEN 2119-04-03 09:52:00   
7    10010058  26359957           2  STEMI_FLAG 2143-10-21 10:40:00   
8    10010058  26359957           3  ED_ARRIVAL 2147-11-18 00:50:00   
9    10010058  26359957           4  ANTI_TAKEN 2147-11-18 05:00:00   

   time_since_ed  stemi_flag  pathway_stage  
0            0.0           0              0  
1            9.0           1              1  
2            9.0           1              2  
3          734.0           1      

# [3] Lab 수치 피처 불러오기 (hosp-icu DB)

In [13]:
import sqlite3
import pandas as pd
import numpy as np

# -----------------------------------------------------------
# [설정] DB 경로 및 타겟 설정
# -----------------------------------------------------------
db_path = '/content/drive/MyDrive/Colab Notebooks/2025-2 데이터애널리틱스/MIMIC4-hosp-icu.db'

# 트로포닌 Item ID (Troponin I: 51002, Troponin T: 51003)
trop_itemids = (51002, 51003)

# 코호트의 hadm_id 목록 추출 (SQL 필터링용)
# 1. NaN 제거
# 2. 정수형(int)으로 변환 (numpy int64가 아닌 python int)
# 3. 튜플로 변환
target_hadm_ids = tuple(map(int, df_cohort['hadm_id'].dropna().unique()))

# -----------------------------------------------------------
# 1. Lab 데이터 로드 (SQLite Query)
# -----------------------------------------------------------
print("DB에서 Troponin 데이터 조회 중...")

conn = sqlite3.connect(db_path)

# SQL 쿼리 작성 시, IN 절에 들어갈 문자열을 안전하게 생성합니다.
# 튜플을 문자열로 변환 (예: "(123, 456, ...)")
ids_str = str(target_hadm_ids)
itemids_str = str(trop_itemids)

# 쿼리문 (f-string 사용하되, 내용은 순수 숫자 문자열임)
query = f"""
SELECT subject_id, hadm_id, itemid, charttime, valuenum, flag
FROM labevents
WHERE itemid IN {itemids_str}
  AND hadm_id IN {ids_str}
"""

try:
    # 쿼리 실행 및 데이터프레임 로드
    df_trop = pd.read_sql_query(query, conn)
    print(f"Troponin 데이터 로드 완료: {len(df_trop)} 건")
except Exception as e:
    print(f"데이터 로드 중 오류 발생: {e}")
    # 오류 메시지를 자세히 보기 위해 쿼리 앞부분 출력 (디버깅용)
    print(f"Query snippet: {query[:200]} ...")
    df_trop = pd.DataFrame()
finally:
    conn.close()

# -----------------------------------------------------------
# (이후 Lab 피처 엔지니어링 코드는 기존과 동일하게 진행)
# -----------------------------------------------------------
if not df_trop.empty:
    # 날짜 변환 및 정렬
    df_trop['charttime'] = pd.to_datetime(df_trop['charttime'])
    df_trop = df_trop.sort_values(by=['hadm_id', 'charttime'])

    # 결측치 처리 (수치가 없는 경우 0 처리)
    df_trop['valuenum'] = df_trop['valuenum'].fillna(0)

    # [Feature 1] last_trop (현재 측정값)
    df_trop['last_trop'] = df_trop['valuenum']

    # [Feature 2] run_max_trop (현재 시점까지의 최대값 - Running Max)
    # expanding().max()를 사용하여 누적 최대값을 구합니다.
    df_trop['run_max_trop'] = df_trop.groupby('hadm_id')['valuenum'].expanding().max().reset_index(level=0, drop=True)

    # [Feature 3] trop_trend (현재값 - 직전값)
    # shift(1)을 사용하여 직전 검사 결과를 가져옵니다.
    df_trop['prev_trop'] = df_trop.groupby('hadm_id')['valuenum'].shift(1).fillna(0)
    df_trop['trop_trend'] = df_trop['last_trop'] - df_trop['prev_trop']

    # 이벤트 이름 부여 (기존 로그와 합치기 위해)
    df_trop['event_name'] = 'TROP_TAKEN'

    # -----------------------------------------------------------
    # 3. 기존 이벤트 로그와 병합 (Merge Strategy)
    # -----------------------------------------------------------
    # 앞 단계에서 생성한 df_events가 있다고 가정
    if 'df_events' in locals():
        # 3-1. 기존 로그에서 내용 없는 껍데기 'TROP_TAKEN' 제거
        df_events_no_trop = df_events[df_events['event_name'] != 'TROP_TAKEN'].copy()

        # 3-2. Lab 데이터에 필요한 컬럼만 남기기
        lab_cols = ['subject_id', 'hadm_id', 'charttime', 'event_name', 'last_trop', 'run_max_trop', 'trop_trend']
        df_trop_merge = df_trop[lab_cols]

        # 3-3. 병합 및 재정렬
        df_final = pd.concat([df_events_no_trop, df_trop_merge], ignore_index=True)
        df_final = df_final.sort_values(by=['subject_id', 'hadm_id', 'charttime'])

        # 3-4. 결측값 채우기 (Forward Fill)
        # 검사하지 않은 시점(예: ECG)에도 가장 최근의 Lab 수치를 유지
        lab_features = ['last_trop', 'run_max_trop']
        df_final[lab_features] = df_final.groupby('hadm_id')[lab_features].ffill().fillna(0)
        df_final['trop_trend'] = df_final['trop_trend'].fillna(0)

        # 3-5. Time Feature 재계산 (Merge로 인해 깨진 시간 정보 복구)
        df_final['current_time'] = df_final['charttime']

        if 'ed_intime' not in df_final.columns:
             df_final = df_final.merge(df_cohort[['hadm_id', 'ed_intime']], on='hadm_id', how='left')

        df_final['time_since_ed'] = (df_final['current_time'] - df_final['ed_intime']).dt.total_seconds() / 60

        print("최종 피처 테이블 생성 완료 (df_final)")
        print(df_final[['hadm_id', 'event_name', 'run_max_trop']].head())
    else:
        print("경고: df_events 변수가 정의되지 않았습니다. 이전 단계 코드를 먼저 실행하세요.")
else:
    print("경고: 해당 코호트의 Troponin 데이터를 찾을 수 없습니다.")

df_final

DB에서 Troponin 데이터 조회 중...
Troponin 데이터 로드 완료: 4013 건
최종 피처 테이블 생성 완료 (df_final)
        hadm_id  event_name  run_max_trop
0      27897940  ED_ARRIVAL          0.00
1      27897940   ECG_TAKEN          0.00
2      27897940  STEMI_FLAG          0.00
13042  27897940  TROP_TAKEN          0.04
3      27897940  ANTI_TAKEN          0.04


Unnamed: 0,subject_id,hadm_id,charttime,event_name,ed_intime,first_stemi_ecg_time,first_troponin_positive_charttime,current_time,time_since_ed,prev_event_time,...,is_trop,cum_trop_cnt,stemi_flag,trop_pos_flag,pci_status,pathway_stage,current_event_id,last_trop,run_max_trop,trop_trend
0,10000764,27897940,2132-10-14 19:31:00,ED_ARRIVAL,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00,2132-10-14 19:31:00,0.0,NaT,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00
1,10000764,27897940,2132-10-14 19:40:00,ECG_TAKEN,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00,2132-10-14 19:40:00,9.0,2132-10-14 19:31:00,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.00,0.00,0.00
2,10000764,27897940,2132-10-14 19:40:00,STEMI_FLAG,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00,2132-10-14 19:40:00,9.0,2132-10-14 19:40:00,...,0.0,0.0,1.0,0.0,0.0,2.0,3.0,0.00,0.00,0.00
13042,10000764,27897940,2132-10-15 07:45:00,TROP_TAKEN,NaT,NaT,NaT,2132-10-15 07:45:00,,NaT,...,,,,,,,,0.04,0.04,0.04
3,10000764,27897940,2132-10-15 20:00:00,ANTI_TAKEN,2132-10-14 19:31:00,2132-10-14 19:40:00,2132-10-15 07:45:00,2132-10-15 20:00:00,1469.0,2132-10-15 07:45:00,...,0.0,1.0,1.0,1.0,0.0,3.0,4.0,0.04,0.04,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9831,19996783,21880161,2188-05-09 05:02:00,ED_ARRIVAL,2188-05-09 05:02:00,NaT,2188-05-09 21:56:00,2188-05-09 05:02:00,0.0,2188-03-05 15:18:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00
10548,19996783,21880161,2188-05-09 21:56:00,TROP_TAKEN,NaT,NaT,NaT,2188-05-09 21:56:00,,NaT,...,,,,,,,,1.37,1.37,1.37
9832,19996783,21880161,2188-05-09 23:00:00,ANTI_TAKEN,2188-05-09 05:02:00,NaT,2188-05-09 21:56:00,2188-05-09 23:00:00,1078.0,2188-05-09 21:56:00,...,0.0,1.0,0.0,1.0,0.0,3.0,4.0,1.37,1.37,0.00
10549,19996783,21880161,2188-05-10 03:56:00,TROP_TAKEN,NaT,NaT,NaT,2188-05-10 03:56:00,,NaT,...,,,,,,,,1.92,1.92,0.55


In [20]:
# =============================================================================
# [Step 4.5] 병합 후 피처 재계산 (Repair Missing Values)
# =============================================================================
# 설명: Lab 데이터(df_trop)가 병합되면서 비어있는 ed_intime과 동적 피처들을 다시 채웁니다.

# 1. ed_intime 및 주요 시간 정보 채우기
# hadm_id별로 ed_intime이 같은 환자라면 동일하므로 forward fill/backward fill로 채웁니다.
time_fill_cols = ['ed_intime', 'first_stemi_ecg_time', 'first_troponin_positive_charttime']
for col in time_fill_cols:
    if col in df_final.columns:
        df_final[col] = df_final.groupby('hadm_id')[col].transform('first') # 그룹 내 첫 번째 값으로 채움

# 2. Time Features 재계산 (ed_intime이 채워졌으므로 가능)
df_final['current_time'] = df_final['charttime']
df_final['time_since_ed'] = (df_final['current_time'] - df_final['ed_intime']).dt.total_seconds() / 60

# prev_event_time & time_since_last 재계산 (새로 들어온 Lab 이벤트 순서 반영)
df_final = df_final.sort_values(by=['subject_id', 'hadm_id', 'charttime']) # 시간순 정렬 필수
df_final['prev_event_time'] = df_final.groupby('hadm_id')['charttime'].shift(1)
df_final['time_since_last'] = (df_final['current_time'] - df_final['prev_event_time']).dt.total_seconds() / 60
df_final['time_since_last'] = df_final['time_since_last'].fillna(0)

# is_night 재계산
df_final['hour'] = df_final['current_time'].dt.hour
df_final['is_night'] = df_final['hour'].apply(lambda h: 1 if (h >= 22 or h < 7) else 0)

# 3. Sequence Features 재계산 (중간에 끼어든 Lab 이벤트 포함하여 순서 다시 매김)
df_final['prefix_len'] = df_final.groupby('hadm_id').cumcount() + 1

# 4. Count & State Features 재계산
# (단순 ffill로는 중간에 끼어든 Lab 이벤트의 count가 갱신 안 될 수 있으므로 재계산 추천)
df_final['is_ecg'] = (df_final['event_name'] == 'ECG_TAKEN').astype(int)
df_final['cum_ecg_cnt'] = df_final.groupby('hadm_id')['is_ecg'].cumsum()

df_final['is_trop'] = (df_final['event_name'] == 'TROP_TAKEN').astype(int)
df_final['cum_trop_cnt'] = df_final.groupby('hadm_id')['is_trop'].cumsum()

# Flag 재계산
mask_stemi = (df_final['first_stemi_ecg_time'].notna()) & (df_final['current_time'] >= df_final['first_stemi_ecg_time'])
df_final['stemi_flag'] = mask_stemi.astype(int)

mask_trop_pos = (df_final['first_troponin_positive_charttime'].notna()) & (df_final['current_time'] >= df_final['first_troponin_positive_charttime'])
df_final['trop_pos_flag'] = mask_trop_pos.astype(int)

# Pathway Stage & ID 매핑 재계산
stage_map = {
    'ED_ARRIVAL': 0, 'ECG_TAKEN': 1, 'TROP_TAKEN': 1,
    'STEMI_FLAG': 2, 'ANTI_TAKEN': 3,
    'PCI_START': 4, 'PCI_END': 5, 'DISCHARGE': 5
}
df_final['pathway_stage'] = df_final['event_name'].map(stage_map).fillna(0)

event_id_map = {name: i for i, name in enumerate(stage_map.keys())}
df_final['current_event_id'] = df_final['event_name'].map(event_id_map)

# PCI Status 재계산 (forward fill 방식)
df_final['pci_status'] = 0
df_final.loc[df_final['event_name'] == 'PCI_START', 'pci_status'] = 1
df_final.loc[df_final['event_name'] == 'PCI_END', 'pci_status'] = 2

# 주의: PCI_END 이후에는 2가 유지되어야 하고, START 이전엔 0이어야 함.
# 위 단순 ffill은 START(1) -> END(2) 전이에는 문제없으나,
# 엄밀하게는 "현재 시점이 pci_endtime 이후인가?" 로직이 더 정확할 수 있음.
# 일단 위 로직으로도 대부분 커버 가능.

print("동적 피처 재계산(Repair) 완료.")

AttributeError: 'SeriesGroupBy' object has no attribute 'replace'

In [21]:
# =============================================================================
# [Step 4.5] 병합 후 피처 재계산 (Repair Missing Values)
# =============================================================================
# 설명: Lab 데이터(df_trop)가 병합되면서 비어있는 ed_intime과 동적 피처들을 다시 채웁니다.

# 1. ed_intime 및 주요 시간 정보 채우기
# hadm_id별로 ed_intime이 같은 환자라면 동일하므로 forward fill/backward fill로 채웁니다.
time_fill_cols = ['ed_intime', 'first_stemi_ecg_time', 'first_troponin_positive_charttime']
for col in time_fill_cols:
    if col in df_final.columns:
        df_final[col] = df_final.groupby('hadm_id')[col].transform('first') # 그룹 내 첫 번째 값으로 채움

# 2. Time Features 재계산 (ed_intime이 채워졌으므로 가능)
df_final['current_time'] = df_final['charttime']
df_final['time_since_ed'] = (df_final['current_time'] - df_final['ed_intime']).dt.total_seconds() / 60

# prev_event_time & time_since_last 재계산 (새로 들어온 Lab 이벤트 순서 반영)
df_final = df_final.sort_values(by=['subject_id', 'hadm_id', 'charttime']) # 시간순 정렬 필수
df_final['prev_event_time'] = df_final.groupby('hadm_id')['charttime'].shift(1)
df_final['time_since_last'] = (df_final['current_time'] - df_final['prev_event_time']).dt.total_seconds() / 60
df_final['time_since_last'] = df_final['time_since_last'].fillna(0)

# is_night 재계산
df_final['hour'] = df_final['current_time'].dt.hour
df_final['is_night'] = df_final['hour'].apply(lambda h: 1 if (h >= 22 or h < 7) else 0)

# 3. Sequence Features 재계산 (중간에 끼어든 Lab 이벤트 포함하여 순서 다시 매김)
df_final['prefix_len'] = df_final.groupby('hadm_id').cumcount() + 1

# 4. Count & State Features 재계산
# (단순 ffill로는 중간에 끼어든 Lab 이벤트의 count가 갱신 안 될 수 있으므로 재계산 추천)
df_final['is_ecg'] = (df_final['event_name'] == 'ECG_TAKEN').astype(int)
df_final['cum_ecg_cnt'] = df_final.groupby('hadm_id')['is_ecg'].cumsum()

df_final['is_trop'] = (df_final['event_name'] == 'TROP_TAKEN').astype(int)
df_final['cum_trop_cnt'] = df_final.groupby('hadm_id')['is_trop'].cumsum()

# Flag 재계산
mask_stemi = (df_final['first_stemi_ecg_time'].notna()) & (df_final['current_time'] >= df_final['first_stemi_ecg_time'])
df_final['stemi_flag'] = mask_stemi.astype(int)

mask_trop_pos = (df_final['first_troponin_positive_charttime'].notna()) & (df_final['current_time'] >= df_final['first_troponin_positive_charttime'])
df_final['trop_pos_flag'] = mask_trop_pos.astype(int)

# Pathway Stage & ID 매핑 재계산
stage_map = {
    'ED_ARRIVAL': 0, 'ECG_TAKEN': 1, 'TROP_TAKEN': 1,
    'STEMI_FLAG': 2, 'ANTI_TAKEN': 3,
    'PCI_START': 4, 'PCI_END': 5, 'DISCHARGE': 5
}
df_final['pathway_stage'] = df_final['event_name'].map(stage_map).fillna(0)

event_id_map = {name: i for i, name in enumerate(stage_map.keys())}
df_final['current_event_id'] = df_final['event_name'].map(event_id_map)

# PCI Status 재계산 (forward fill 방식)
df_final['pci_status'] = 0
df_final.loc[df_final['event_name'] == 'PCI_START', 'pci_status'] = 1
df_final.loc[df_final['event_name'] == 'PCI_END', 'pci_status'] = 2
df_final['pci_status'] = df_final.groupby('hadm_id')['pci_status'].replace(0, method='ffill').fillna(0)
# 주의: PCI_END 이후에는 2가 유지되어야 하고, START 이전엔 0이어야 함.
# 위 단순 ffill은 START(1) -> END(2) 전이에는 문제없으나,
# 엄밀하게는 "현재 시점이 pci_endtime 이후인가?" 로직이 더 정확할 수 있음.
# 일단 위 로직으로도 대부분 커버 가능.

print("동적 피처 재계산(Repair) 완료.")

AttributeError: 'SeriesGroupBy' object has no attribute 'replace'

In [17]:
import numpy as np
import pandas as pd

# -----------------------------------------------------------
# [설정] EOS(End of Sequence) 토큰 정의
# -----------------------------------------------------------
# df_final['current_event_id']가 이미 존재한다고 가정합니다.
EOS_TOKEN = int(df_final['current_event_id'].max()) + 1

# -----------------------------------------------------------
# 1. Delay Features (Raw Values)
# -----------------------------------------------------------
# 1-1. 각 환자의 '첫 이벤트 발생 시각' 테이블 만들기
first_times = df_final.groupby(['hadm_id', 'event_name'])['charttime'].min().unstack()

# 타겟 이벤트 정의
target_events = {
    'ECG_TAKEN': 'delay_ecg',
    'TROP_TAKEN': 'delay_trop',
    'ANTI_TAKEN': 'delay_med' # 실제 이벤트명: ANTI_PLT_ADMIN 등 확인 필요
}

# 병합을 위해 ed_intime 확보
if 'ed_intime' not in df_final.columns:
    df_final = df_final.merge(df_cohort[['hadm_id', 'ed_intime']], on='hadm_id', how='left')

# first_times 정보를 df_final에 병합
temp_delay = df_final[['hadm_id', 'current_time', 'ed_intime']].merge(
    first_times.rename(columns=lambda x: f"first_{x}_time"),
    on='hadm_id',
    how='left'
)

for evt_name, col_name in target_events.items():
    first_col = f"first_{evt_name}_time"

    if first_col in temp_delay.columns:
        # 계산된 지연 시간 (분 단위 Raw Value)
        fixed_delay = (temp_delay[first_col] - temp_delay['ed_intime']).dt.total_seconds() / 60

        # 조건: 이벤트 기록이 없거나(NaT), 현재 시각이 첫 이벤트보다 빠르면 -> -1
        # 그 외 -> fixed_delay 그대로 사용 (로그 변환 X)
        df_final[col_name] = np.where(
            (temp_delay[first_col].isna()) | (temp_delay['current_time'] < temp_delay[first_col]),
            -1,
            fixed_delay
        )
    else:
        df_final[col_name] = -1

# -----------------------------------------------------------
# 2. Cyclic Time Encoding (Feature Generation)
# -----------------------------------------------------------
# 이건 스케일링이라기보다 '새로운 정보 생성'이므로 유지하는 것을 추천합니다.
# (모델이 시간을 이해하는 데 필수적임)
df_final['hour_float'] = df_final['current_time'].dt.hour + df_final['current_time'].dt.minute / 60.0
df_final['sin_time'] = np.sin(2 * np.pi * df_final['hour_float'] / 24.0)
df_final['cos_time'] = np.cos(2 * np.pi * df_final['hour_float'] / 24.0)

# -----------------------------------------------------------
# 3. Target Generation (Shift -1)
# -----------------------------------------------------------
# [Target 1] Next Event ID (다음 행의 ID, 없으면 EOS)
df_final['target_next_evt'] = df_final.groupby('hadm_id')['current_event_id'].shift(-1).fillna(EOS_TOKEN).astype(int)

# [Target 2] Time to Next Event (분 단위 Raw Value)
next_time = df_final.groupby('hadm_id')['current_time'].shift(-1)
df_final['target_time_to_next'] = (next_time - df_final['current_time']).dt.total_seconds() / 60
df_final['target_time_to_next'] = df_final['target_time_to_next'].fillna(0)
# (로그 변환 제거됨)

# [Target 3] Remain LOS (시간 단위 Raw Value)
if 'dischtime' not in df_final.columns:
     df_final = df_final.merge(df_cohort[['hadm_id', 'dischtime']], on='hadm_id', how='left')

df_final['target_remain_los'] = (df_final['dischtime'] - df_final['current_time']).dt.total_seconds() / 3600
df_final['target_remain_los'] = df_final['target_remain_los'].fillna(0)
df_final.loc[df_final['target_remain_los'] < 0, 'target_remain_los'] = 0

# [Target 4] Mortality (Binary)
if 'death_flag' not in df_final.columns:
    df_final = df_final.merge(df_cohort[['hadm_id', 'death_flag']], on='hadm_id', how='left')
df_final['target_mortality'] = df_final['death_flag'].fillna(0).astype(int)

# -----------------------------------------------------------
# 4. Final Master Table Construction (수정됨)
# -----------------------------------------------------------
final_columns = [
    # Key (3개)
    'subject_id', 'hadm_id', 'prefix_len',

    # Static Features (6개) - ★여기가 빠졌었습니다!
    'age', 'gender', 'race', 'arrival_transport_code',
    'cci_score', 'hfrs_score',

    # Dynamic - Sequence (1개)
    'current_event_id',

    # Dynamic - Time (Raw) (4개)
    'time_since_ed', 'sin_time', 'cos_time', 'is_night',

    # Dynamic - Delay (Raw) (3개)
    'delay_ecg', 'delay_trop', 'delay_med',

    # Dynamic - State (4개)
    'stemi_flag', 'trop_pos_flag', 'pci_status', 'pathway_stage',

    # Dynamic - Lab (Raw) (5개)
    'last_trop', 'run_max_trop', 'trop_trend',
    'cum_ecg_cnt', 'cum_trop_cnt',

    # Targets (4개)
    'target_next_evt', 'target_time_to_next', 'target_remain_los', 'target_mortality'
]

# (중요) 정적 컬럼들이 df_final에 있는지 확인하고, 없으면 코호트에서 merge해야 합니다.
static_cols_to_merge = ['age', 'gender', 'race', 'cci_score', 'hfrs_score', 'arrival_transport_code']
missing_static = [c for c in static_cols_to_merge if c not in df_final.columns]

if missing_static:
    print(f"Merge missing static cols: {missing_static}")
    df_final = df_final.merge(df_cohort[['hadm_id'] + missing_static], on='hadm_id', how='left')

# 최종 선택
existing_cols = [c for c in final_columns if c in df_final.columns]
df_master = df_final[existing_cols].copy()

print(f"최종 컬럼 개수: {len(df_master.columns)}")
print(df_master.columns.tolist())
df_master

최종 컬럼 개수: 30
['subject_id', 'hadm_id', 'prefix_len', 'age', 'gender', 'race', 'arrival_transport_code', 'cci_score', 'hfrs_score', 'current_event_id', 'time_since_ed', 'sin_time', 'cos_time', 'is_night', 'delay_ecg', 'delay_trop', 'delay_med', 'stemi_flag', 'trop_pos_flag', 'pci_status', 'pathway_stage', 'last_trop', 'run_max_trop', 'trop_trend', 'cum_ecg_cnt', 'cum_trop_cnt', 'target_next_evt', 'target_time_to_next', 'target_remain_los', 'target_mortality']


Unnamed: 0,subject_id,hadm_id,prefix_len,age,gender,race,arrival_transport_code,cci_score,hfrs_score,current_event_id,...,pathway_stage,last_trop,run_max_trop,trop_trend,cum_ecg_cnt,cum_trop_cnt,target_next_evt,target_time_to_next,target_remain_los,target_mortality
0,10000764,27897940,1.0,86,M,WHITE,0,0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.0,0.0,1,9.0,116.983333,0
1,10000764,27897940,2.0,86,M,WHITE,0,0,0.0,1.0,...,1.0,0.00,0.00,0.00,1.0,0.0,3,0.0,116.833333,0
2,10000764,27897940,3.0,86,M,WHITE,0,0,0.0,3.0,...,2.0,0.00,0.00,0.00,1.0,0.0,8,725.0,116.833333,0
3,10000764,27897940,,86,M,WHITE,0,0,0.0,,...,,0.04,0.04,0.04,,,4,735.0,104.750000,0
4,10000764,27897940,5.0,86,M,WHITE,0,0,0.0,4.0,...,3.0,0.04,0.04,0.00,1.0,1.0,8,603.0,92.500000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14307,19996783,21880161,2.0,89,M,ASIAN - CHINESE,1,1,15.1,0.0,...,0.0,0.00,0.00,0.00,1.0,0.0,8,1014.0,253.116667,1
14308,19996783,21880161,,89,M,ASIAN - CHINESE,1,1,15.1,,...,,1.37,1.37,1.37,,,4,64.0,236.216667,1
14309,19996783,21880161,4.0,89,M,ASIAN - CHINESE,1,1,15.1,4.0,...,3.0,1.37,1.37,0.00,1.0,1.0,8,296.0,235.150000,1
14310,19996783,21880161,,89,M,ASIAN - CHINESE,1,1,15.1,,...,,1.92,1.92,0.55,,,7,13813.0,230.216667,1


# [4] 결측치 파악

In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. 결측치 개수 및 비율 확인 함수
def check_missing_values(df):
    missing = df.isnull().sum()
    missing = missing[missing > 0] # 결측치 있는 것만 추출
    if missing.empty:
        print("🎉 결측치가 없습니다! 완벽합니다.")
        return None

    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Percentage': (missing / len(df)) * 100
    })
    missing_df = missing_df.sort_values(by='Percentage', ascending=False)

    print("=== [EDA] 컬럼별 결측치 현황 ===")
    print(missing_df)

    return missing_df

# 실행
missing_report = check_missing_values(df_master)

=== [EDA] 컬럼별 결측치 현황 ===
                  Missing Count  Percentage
prefix_len                 4058   28.353829
current_event_id           4058   28.353829
time_since_ed              4058   28.353829
is_night                   4058   28.353829
delay_trop                 4058   28.353829
cum_ecg_cnt                4058   28.353829
stemi_flag                 4058   28.353829
trop_pos_flag              4058   28.353829
pci_status                 4058   28.353829
cum_trop_cnt               4058   28.353829
pathway_stage              4058   28.353829
delay_ecg                  3904   27.277809
delay_med                  3308   23.113471
