In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [31]:
# 1. 깃 레파지토리 클론 실행
!git clone https://github.com/sosomeet/DA_STEMI.git

# 2. 클론된 DA_STEMI 폴더로 이동
%cd DA_STEMI

# 3. 중복 폴더(DA_STEMI/DA_STEMI)가 있을 경우 내용물을 상위 폴더로 이동 후 삭제
!if [ -d "DA_STEMI" ]; then mv DA_STEMI/* . && mv DA_STEMI/.??* . 2>/dev/null && rm -r DA_STEMI; fi

# 4. 파일 목록 최종 확인
!ls

fatal: destination path 'DA_STEMI' already exists and is not an empty directory.
/content/drive/MyDrive/DA_STEMI
 cohort
'cohort EDA(1).ipynb'
 cohort_EDA_report.html
 cohort.ipynb
 cohort_ver20_expanded_STEMI.ipynb
'cohort_ver21_expanded_STEMI(2).ipynb'
 make_cohort_ver10.ipynb
 make_cohort_ver11.ipynb
 make_cohort_ver7.ipynb
 make_cohort_ver8.ipynb
 make_cohort_ver9.ipynb
 missing_as_signal_EDA.ipynb
 process_classfication_and_Event_log.ipynb
 process_variation_analysis.ipynb
 README.md
 timevars_analysis.ipynb
 Timevars.ipynb


In [32]:
# 1. 사용자 이메일 설정
!git config --global user.email "ji85673@gmail.com"

# 2. 사용자 이름 설정
!git config --global user.name "ji85673"

In [33]:
import pandas as pd
import os

BASE_PATH = "/content/drive/MyDrive/DA_STEMI/cohort"
COHORT_FILE = "cohort_ver21_expanded_STEMI(2).csv"

cohort_path = os.path.join(BASE_PATH, COHORT_FILE)

df_stemi = pd.read_csv(cohort_path)
df_stemi.head()


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,first_ecg_time,...,door_to_antithrombotic_min,door_to_cath_min,boarding_delay_min,death_flag,los_days,transfer_type,is_emergency_admission,dod,eventtype,cohort_source
0,10010058,26359957.0,39919549,2147-11-18 00:50:00,2147-11-18 03:19:00,M,ASIAN - SOUTH EAST ASIAN,AMBULANCE,ADMITTED,,...,,,,1.0,1.232326,,,2147-11-19,discharge,ICD_Confirmed
1,10012438,22764825.0,35750515,2178-06-07 19:33:00,2178-06-07 21:57:00,M,WHITE,AMBULANCE,ADMITTED,,...,,,,0.0,0.746007,,,,discharge,ICD_Confirmed
2,10013310,21243435.0,31736720,2153-05-26 08:56:00,2153-05-26 14:18:39,F,BLACK/AFRICAN,UNKNOWN,ADMITTED,,...,,,,1.0,6.296898,,,2153-11-19,discharge,ICD_Confirmed
3,10013310,27682188.0,35160955,2153-05-06 10:21:00+00:00,2153-05-06 18:28:00+00:00,F,BLACK/AFRICAN,AMBULANCE,ADMITTED,2153-05-06 10:25,...,,484.0,-25.0,1.0,6.82,HOME HEALTH CARE,1.0,2153-11-19,discharge,ICD_Confirmed
4,10033552,26487381.0,32542054,2136-02-29 22:09:00+00:00,2136-03-01 02:26:37+00:00,F,OTHER,AMBULANCE,ADMITTED,2136-02-29 22:21,...,,,-1.616667,1.0,2.75,HOME,0.0,2141-07-15,discharge,ICD_Confirmed


In [34]:
import sqlite3

# 너 DB 실제 위치에 맞게 수정해서 쓰면 돼
DB_PATH = "/content/drive/MyDrive/MIMIC4-hosp-icu.db"  # 경로 다르면 여기만 바꾸기

conn = sqlite3.connect(DB_PATH)
conn


<sqlite3.Connection at 0x7caa533eb6a0>

In [35]:
import pandas as pd

tables = pd.read_sql_query(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
    conn
)
tables


Unnamed: 0,name
0,admissions
1,caregiver
2,chartevents
3,d_hcpcs
4,d_icd_diagnoses
5,d_icd_procedures
6,d_items
7,d_labitems
8,datetimeevents
9,diagnoses_icd


In [36]:
# ⚠️ 아래 리스트는 예시. 팀에서 정한 NSTEMI 코드로 교체하면 된다.
# NSTEMI ICD-9
nstemi_icd9 = [
    "4107",     # 점 제거한 버전
    "41070",
    "41071",
    "41072"
]

# NSTEMI ICD-10
nstemi_icd10 = [
    "I214"      # I21.4 (NSTEMI)
]


icd9_list = ",".join(f"'{c}'" for c in nstemi_icd9)
icd10_list = ",".join(f"'{c}'" for c in nstemi_icd10)

query_nstemi = f"""
SELECT DISTINCT
    d.subject_id,
    d.hadm_id,
    d.icd_code,
    d.icd_version
FROM diagnoses_icd d
WHERE
    (d.icd_version = 9 AND REPLACE(d.icd_code, '.', '') IN ({icd9_list}))
    OR
    (d.icd_version = 10 AND REPLACE(d.icd_code, '.', '') IN ({icd10_list}));
"""

df_nstemi_diag = pd.read_sql_query(query_nstemi, conn)

print("▶ NSTEMI 진단 환자 수:", df_nstemi_diag['hadm_id'].nunique())
df_nstemi_diag.head()


▶ NSTEMI 진단 환자 수: 10248


Unnamed: 0,subject_id,hadm_id,icd_code,icd_version
0,10000764,27897940,41071,9
1,10000980,26913865,41071,9
2,10001492,27463908,41071,9
3,10002013,24760295,41071,9
4,10002495,24982426,I214,10


In [37]:
# 팀원이 만든 STEMI 코호트에서 hadm_id 집합 뽑기
stemi_hadm = set(df_stemi['hadm_id'].dropna().unique())

# STEMI에 이미 포함된 입원건(hadm_id)은 제외 → 순수 NSTEMI만 남기기
df_nstemi_only = df_nstemi_diag[~df_nstemi_diag['hadm_id'].isin(stemi_hadm)].copy()

print("▶ 순수 NSTEMI hadm 수:", df_nstemi_only['hadm_id'].nunique())
df_nstemi_only.head()


▶ 순수 NSTEMI hadm 수: 10147


Unnamed: 0,subject_id,hadm_id,icd_code,icd_version
1,10000980,26913865,41071,9
2,10001492,27463908,41071,9
3,10002013,24760295,41071,9
4,10002495,24982426,I214,10
5,10002667,23197839,I214,10


In [38]:
import pandas as pd

ED_PATH = "/content/drive/MyDrive/mimic-iv-ed-2.2/ed/edstays.csv.gz"

df_ed_raw = pd.read_csv(ED_PATH, compression="gzip", low_memory=False)
df_ed_raw.head()
df_ed_raw.dtypes


Unnamed: 0,0
subject_id,int64
hadm_id,float64
stay_id,int64
intime,object
outtime,object
gender,object
race,object
arrival_transport,object
disposition,object


In [39]:
# STEMI ICD-9
stemi_icd9 = [
    "4100","41000","41001","41002",
    "4101","41010","41011","41012",
    "4102","41020","41021","41022",
    "4103","41030","41031","41032",
    "4104","41040","41041","41042",
    "4105","41050","41051","41052",
    "4106","41060","41061","41062",
    "4108","41080","41081","41082"
]

# STEMI ICD-10
stemi_icd10 = ["I210","I211","I212","I213"]

icd9_list = ",".join(f"'{c}'" for c in stemi_icd9)
icd10_list = ",".join(f"'{c}'" for c in stemi_icd10)

query_stemi = f"""
SELECT DISTINCT
    d.subject_id,
    d.hadm_id,
    d.icd_code,
    d.icd_version
FROM diagnoses_icd d
WHERE
    (d.icd_version = 9 AND REPLACE(d.icd_code, '.', '') IN ({icd9_list}))
    OR
    (d.icd_version = 10 AND REPLACE(d.icd_code, '.', '') IN ({icd10_list}));
"""


In [40]:
df_stemi_diag = pd.read_sql_query(query_stemi, conn)

original_stemi_hadm = set(df_stemi['hadm_id'].unique())

df_stemi_only_new = df_stemi_diag[~df_stemi_diag['hadm_id'].isin(original_stemi_hadm)].copy()

print("▶ 새롭게 발견된 STEMI hadm:", df_stemi_only_new['hadm_id'].nunique())
df_stemi_only_new.head()


▶ 새롭게 발견된 STEMI hadm: 1105


Unnamed: 0,subject_id,hadm_id,icd_code,icd_version
0,10002155,23822395,41011,9
1,10009686,29681222,41021,9
4,10038849,28195068,41011,9
5,10042377,25834187,41052,9
6,10045960,21342513,41051,9


In [41]:
# 기존 코호트 데이터프레임 컬럼 확인
print("📌 기존 STEMI 코호트 컬럼 리스트:")
print(df_stemi.columns.tolist())

# 컬럼별 데이터 타입도 확인
print("\n📌 dtype 확인:")
print(df_stemi.dtypes)

# 상위 5행 확인
df_stemi.head()


📌 기존 STEMI 코호트 컬럼 리스트:
['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'gender', 'race', 'arrival_transport', 'disposition', 'first_ecg_time', 'door_to_ecg_min', 'admittime', 'lab_tat_min', 'door_to_antithrombotic_min', 'door_to_cath_min', 'boarding_delay_min', 'death_flag', 'los_days', 'transfer_type', 'is_emergency_admission', 'dod', 'eventtype', 'cohort_source']

📌 dtype 확인:
subject_id                      int64
hadm_id                       float64
stay_id                         int64
intime                         object
outtime                        object
gender                         object
race                           object
arrival_transport              object
disposition                    object
first_ecg_time                 object
door_to_ecg_min               float64
admittime                      object
lab_tat_min                   float64
door_to_antithrombotic_min    float64
door_to_cath_min              float64
boarding_delay_min            float64
d

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,first_ecg_time,...,door_to_antithrombotic_min,door_to_cath_min,boarding_delay_min,death_flag,los_days,transfer_type,is_emergency_admission,dod,eventtype,cohort_source
0,10010058,26359957.0,39919549,2147-11-18 00:50:00,2147-11-18 03:19:00,M,ASIAN - SOUTH EAST ASIAN,AMBULANCE,ADMITTED,,...,,,,1.0,1.232326,,,2147-11-19,discharge,ICD_Confirmed
1,10012438,22764825.0,35750515,2178-06-07 19:33:00,2178-06-07 21:57:00,M,WHITE,AMBULANCE,ADMITTED,,...,,,,0.0,0.746007,,,,discharge,ICD_Confirmed
2,10013310,21243435.0,31736720,2153-05-26 08:56:00,2153-05-26 14:18:39,F,BLACK/AFRICAN,UNKNOWN,ADMITTED,,...,,,,1.0,6.296898,,,2153-11-19,discharge,ICD_Confirmed
3,10013310,27682188.0,35160955,2153-05-06 10:21:00+00:00,2153-05-06 18:28:00+00:00,F,BLACK/AFRICAN,AMBULANCE,ADMITTED,2153-05-06 10:25,...,,484.0,-25.0,1.0,6.82,HOME HEALTH CARE,1.0,2153-11-19,discharge,ICD_Confirmed
4,10033552,26487381.0,32542054,2136-02-29 22:09:00+00:00,2136-03-01 02:26:37+00:00,F,OTHER,AMBULANCE,ADMITTED,2136-02-29 22:21,...,,,-1.616667,1.0,2.75,HOME,0.0,2141-07-15,discharge,ICD_Confirmed


In [42]:
# 기존 코호트 컬럼 리스트 가져오기
stemi_cols = df_stemi.columns.tolist()

# 새롭게 찾은 STEMI 신규 환자 df_stemi_only_new의 기본 구조 확장
df_new = df_stemi_only_new.copy()

# 기존 코호트에 없는 컬럼들을 신규 df에 추가하고 NaN으로 채우기
for col in stemi_cols:
    if col not in df_new.columns:
        df_new[col] = pd.NA

# 기존 컬럼 순서와 동일하게 맞추기
df_new = df_new[stemi_cols]

df_new.head()


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,first_ecg_time,...,door_to_antithrombotic_min,door_to_cath_min,boarding_delay_min,death_flag,los_days,transfer_type,is_emergency_admission,dod,eventtype,cohort_source
0,10002155,23822395,,,,,,,,,...,,,,,,,,,,
1,10009686,29681222,,,,,,,,,...,,,,,,,,,,
4,10038849,28195068,,,,,,,,,...,,,,,,,,,,
5,10042377,25834187,,,,,,,,,...,,,,,,,,,,
6,10045960,21342513,,,,,,,,,...,,,,,,,,,,


In [43]:
df_stemi_expanded = pd.concat([df_stemi, df_new], ignore_index=True)

print("합친 후 shape:", df_stemi_expanded.shape)
df_stemi_expanded.head()


합친 후 shape: (3098, 23)


  df_stemi_expanded = pd.concat([df_stemi, df_new], ignore_index=True)


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,first_ecg_time,...,door_to_antithrombotic_min,door_to_cath_min,boarding_delay_min,death_flag,los_days,transfer_type,is_emergency_admission,dod,eventtype,cohort_source
0,10010058,26359957.0,39919549,2147-11-18 00:50:00,2147-11-18 03:19:00,M,ASIAN - SOUTH EAST ASIAN,AMBULANCE,ADMITTED,,...,,,,1.0,1.232326,,,2147-11-19,discharge,ICD_Confirmed
1,10012438,22764825.0,35750515,2178-06-07 19:33:00,2178-06-07 21:57:00,M,WHITE,AMBULANCE,ADMITTED,,...,,,,0.0,0.746007,,,,discharge,ICD_Confirmed
2,10013310,21243435.0,31736720,2153-05-26 08:56:00,2153-05-26 14:18:39,F,BLACK/AFRICAN,UNKNOWN,ADMITTED,,...,,,,1.0,6.296898,,,2153-11-19,discharge,ICD_Confirmed
3,10013310,27682188.0,35160955,2153-05-06 10:21:00+00:00,2153-05-06 18:28:00+00:00,F,BLACK/AFRICAN,AMBULANCE,ADMITTED,2153-05-06 10:25,...,,484.0,-25.0,1.0,6.82,HOME HEALTH CARE,1.0,2153-11-19,discharge,ICD_Confirmed
4,10033552,26487381.0,32542054,2136-02-29 22:09:00+00:00,2136-03-01 02:26:37+00:00,F,OTHER,AMBULANCE,ADMITTED,2136-02-29 22:21,...,,,-1.616667,1.0,2.75,HOME,0.0,2141-07-15,discharge,ICD_Confirmed


In [44]:
save_path = "/content/drive/MyDrive/DA_STEMI/cohort/cohort_ver22_STEMI_expanded.csv"
df_stemi_expanded.to_csv(save_path, index=False)

print("저장 완료:", save_path)


저장 완료: /content/drive/MyDrive/DA_STEMI/cohort/cohort_ver22_STEMI_expanded.csv


In [45]:
df_1105 = df_stemi_only_new.copy()


In [46]:
df_1105 = df_1105.merge(
    df_ed[['subject_id','hadm_id','ed_stay_id','ed_intime','ed_outtime','disposition']],
    on=['subject_id','hadm_id'],
    how='left'
)

print("ED 방문 있는 사람:", df_1105['ed_stay_id'].notna().sum())
print("ED 없음:", df_1105['ed_stay_id'].isna().sum())


ED 방문 있는 사람: 54
ED 없음: 1061


In [47]:
df_1105 = df_1105.merge(
    df_icu[['subject_id','hadm_id','icu_stay_id','icu_intime']],
    on=['subject_id','hadm_id'],
    how='left'
)

print("ICU 방문 있는 사람:", df_1105['icu_stay_id'].notna().sum())
print("ICU 없음:", df_1105['icu_stay_id'].isna().sum())


ICU 방문 있는 사람: 811
ICU 없음: 394


In [48]:
df_1105 = df_1105.merge(
    df_adm[['subject_id','hadm_id','admittime']],
    on=['subject_id','hadm_id'],
    how='left'
)

df_1105['admittime'] = pd.to_datetime(df_1105['admittime'])
df_1105['icu_intime'] = pd.to_datetime(df_1105['icu_intime'])
df_1105['ed_intime'] = pd.to_datetime(df_1105['ed_intime'])


In [49]:
# PCI 시술 코드 목록 예시
pci_codes9 = ["3606", "3607", "3605", "3601"]
pci_codes10 = ["02703ZZ", "02713ZZ", "02723ZZ", "02733ZZ"]

df_proc = pd.read_sql_query("""
SELECT subject_id, hadm_id, icd_code, icd_version
FROM procedures_icd
""", conn)

df_proc['icd_code_clean'] = df_proc['icd_code'].str.replace('.','', regex=False)

df_pci = df_proc[
    ((df_proc['icd_version']==9) & (df_proc['icd_code_clean'].isin(pci_codes9)))
  | ((df_proc['icd_version']==10) & (df_proc['icd_code_clean'].isin(pci_codes10)))
]

df_pci = df_pci[['subject_id','hadm_id']].drop_duplicates()

df_1105 = df_1105.merge(df_pci, on=['subject_id','hadm_id'], how='left', indicator='pci_mark')

df_1105['has_pci'] = (df_1105['pci_mark']=="both")
print(df_1105['has_pci'].value_counts())


has_pci
False    684
True     521
Name: count, dtype: int64


In [50]:
df_1105['valid_time'] = True

df_1105.loc[
    (df_1105['ed_intime'].notna()) & (df_1105['admittime'].notna()) &
    (df_1105['ed_intime'] > df_1105['admittime']),
    'valid_time'
] = False

df_1105.loc[
    (df_1105['ed_intime'].notna()) & (df_1105['icu_intime'].notna()) &
    (df_1105['ed_intime'] > df_1105['icu_intime']),
    'valid_time'
] = False

print(df_1105['valid_time'].value_counts())


valid_time
True    1205
Name: count, dtype: int64


In [51]:
df_pure = df_1105[
    (df_1105['ed_stay_id'].notna()) &
    (df_1105['icu_stay_id'].notna()) &
    (df_1105['has_pci'] == True) &
    (df_1105['valid_time'] == True)
]

print("최종 살아남은 진성 STEMI 후보 수:", len(df_pure))
df_pure.head()


최종 살아남은 진성 STEMI 후보 수: 2


Unnamed: 0,subject_id,hadm_id,icd_code,icd_version,ed_stay_id,ed_intime,ed_outtime,disposition,icu_stay_id,icu_intime,admittime,pci_mark,has_pci,valid_time
418,13104348,27693989,41042,9,38671493.0,2186-09-06 11:04:00,2186-09-06 17:25:00,ADMITTED,34595462.0,2186-09-22 10:07:51,2186-09-06 15:24:00,both,True,True
419,13104348,27693989,41042,9,38671493.0,2186-09-06 11:04:00,2186-09-06 17:25:00,ADMITTED,37158575.0,2186-09-11 18:36:15,2186-09-06 15:24:00,both,True,True


In [52]:
# 기존 STEMI 코호트 컬럼
stemi_cols = df_stemi.columns.tolist()

# df_pure를 기존 코호트 컬럼 구조에 맞춰 확장
df_pure_aligned = df_pure.copy()

for col in stemi_cols:
    if col not in df_pure_aligned.columns:
        df_pure_aligned[col] = pd.NA

# 컬럼 순서 동일하게 맞추기
df_pure_aligned = df_pure_aligned[stemi_cols]


In [53]:
df_stemi_updated = pd.concat([df_stemi, df_pure_aligned], ignore_index=True)

print("업데이트된 코호트 크기:", df_stemi_updated.shape)
df_stemi_updated.tail()


업데이트된 코호트 크기: (1985, 23)


  df_stemi_updated = pd.concat([df_stemi, df_pure_aligned], ignore_index=True)


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,first_ecg_time,...,door_to_antithrombotic_min,door_to_cath_min,boarding_delay_min,death_flag,los_days,transfer_type,is_emergency_admission,dod,eventtype,cohort_source
1980,19950864,22572134.0,30618056.0,2130-07-13 12:51:00,2130-07-14 00:04:00,M,BLACK/CARIBBEAN ISLAND,,ADMITTED,,...,,,,,,,,,,ECG_Text_Mining
1981,19970838,23840655.0,35514872.0,2116-07-04 02:32:00,2116-07-04 06:29:00,M,WHITE,,ADMITTED,,...,,,,,,,,,,ECG_Text_Mining
1982,19993842,,37776073.0,2148-12-08 17:23:00,2148-12-08 22:11:00,M,,,EXPIRED,,...,,,,,,,,,,ECG_Text_Mining
1983,13104348,27693989.0,,,,,,,ADMITTED,,...,,,,,,,,,,
1984,13104348,27693989.0,,,,,,,ADMITTED,,...,,,,,,,,,,


In [54]:
save_path = "/content/drive/MyDrive/DA_STEMI/cohort/cohort_ver22_expended_STEMI(3).csv"
df_stemi_updated.to_csv(save_path, index=False)

print("저장 완료:", save_path)


저장 완료: /content/drive/MyDrive/DA_STEMI/cohort/cohort_ver22_expended_STEMI(3).csv


In [55]:
import pandas as pd

COHORT_PATH = "/content/drive/MyDrive/DA_STEMI/cohort/cohort_ver22_expended_STEMI(3).csv"

df_stemi_v22 = pd.read_csv(COHORT_PATH)
df_stemi_v22.head()
print(df_stemi_v22.shape)



(1985, 23)


In [61]:
import pandas as pd
import sqlite3

# === 경로 세팅 ===
DB_PATH = "/content/drive/MyDrive/MIMIC4-hosp-icu.db"  # 너가 쓰는 hosp+icu DB
ED_PATH = "/content/drive/MyDrive/mimic-iv-ed-2.2/ed/edstays.csv.gz"

# === DB 연결 ===
conn = sqlite3.connect(DB_PATH)

# === ED stays 불러오기 ===
df_ed_raw = pd.read_csv(ED_PATH, compression="gzip", low_memory=False)
df_ed_raw.head()



Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,WHITE,AMBULANCE,ADMITTED
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,WHITE,AMBULANCE,ADMITTED
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,WHITE,AMBULANCE,ADMITTED
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,WHITE,AMBULANCE,HOME
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,WHITE,AMBULANCE,ADMITTED


In [62]:
# 필요한 컬럼만 사용 & 컬럼명 정리
df_ed = df_ed_raw[[
    "subject_id", "hadm_id", "stay_id",
    "intime", "outtime",
    "arrival_transport", "disposition"
]].copy()

df_ed = df_ed.rename(columns={
    "stay_id": "ed_stay_id",
    "intime": "ed_intime",
    "outtime": "ed_outtime"
})

# 시간형으로 변환
df_ed["ed_intime"] = pd.to_datetime(df_ed["ed_intime"], errors="coerce")
df_ed["ed_outtime"] = pd.to_datetime(df_ed["ed_outtime"], errors="coerce")

df_ed.head()


Unnamed: 0,subject_id,hadm_id,ed_stay_id,ed_intime,ed_outtime,arrival_transport,disposition
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,AMBULANCE,ADMITTED
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,AMBULANCE,ADMITTED
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,AMBULANCE,ADMITTED
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,AMBULANCE,HOME
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,AMBULANCE,ADMITTED


In [63]:
# 필요한 컬럼만 사용 & 컬럼명 정리
df_ed = df_ed_raw[[
    "subject_id", "hadm_id", "stay_id",
    "intime", "outtime",
    "arrival_transport", "disposition"
]].copy()

df_ed = df_ed.rename(columns={
    "stay_id": "ed_stay_id",
    "intime": "ed_intime",
    "outtime": "ed_outtime"
})

# 시간형으로 변환
df_ed["ed_intime"] = pd.to_datetime(df_ed["ed_intime"], errors="coerce")
df_ed["ed_outtime"] = pd.to_datetime(df_ed["ed_outtime"], errors="coerce")

df_ed.head()


Unnamed: 0,subject_id,hadm_id,ed_stay_id,ed_intime,ed_outtime,arrival_transport,disposition
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,AMBULANCE,ADMITTED
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,AMBULANCE,ADMITTED
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,AMBULANCE,ADMITTED
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,AMBULANCE,HOME
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,AMBULANCE,ADMITTED


In [64]:
# Admissions
df_adm = pd.read_sql_query("""
SELECT subject_id, hadm_id, admittime, admission_type
FROM admissions;
""", conn)

df_adm["admittime"] = pd.to_datetime(df_adm["admittime"], errors="coerce")

# 응급입원 플래그
df_adm["is_emergency_admission"] = df_adm["admission_type"].isin(
    ["EMERGENCY", "URGENT"]
).astype(int)

df_adm.head()


Unnamed: 0,subject_id,hadm_id,admittime,admission_type,is_emergency_admission
0,10000032,22595853,2180-05-06 22:23:00,URGENT,1
1,10000032,22841357,2180-06-26 18:27:00,EW EMER.,0
2,10000032,25742920,2180-08-05 23:44:00,EW EMER.,0
3,10000032,29079034,2180-07-23 12:35:00,EW EMER.,0
4,10000068,25022803,2160-03-03 23:16:00,EU OBSERVATION,0


In [65]:
# ICU stays
df_icu = pd.read_sql_query("""
SELECT subject_id, hadm_id,
       stay_id AS icu_stay_id,
       intime AS icu_intime,
       outtime AS icu_outtime
FROM icustays;
""", conn)

df_icu["icu_intime"] = pd.to_datetime(df_icu["icu_intime"], errors="coerce")
df_icu["icu_outtime"] = pd.to_datetime(df_icu["icu_outtime"], errors="coerce")

df_icu.head()


Unnamed: 0,subject_id,hadm_id,icu_stay_id,icu_intime,icu_outtime
0,10000032,29079034,39553978,2180-07-23 14:00:00,2180-07-23 23:50:47
1,10000690,25860671,37081114,2150-11-02 19:37:00,2150-11-06 17:03:17
2,10000980,26913865,39765666,2189-06-27 08:42:00,2189-06-27 20:38:27
3,10001217,24597018,37067082,2157-11-20 19:18:02,2157-11-21 22:08:00
4,10001217,27703517,34592300,2157-12-19 15:42:24,2157-12-20 14:27:41


In [66]:
# ED를 기준으로 admissions / ICU 붙이기
df_base = (
    df_ed
    .merge(df_adm[["subject_id","hadm_id","admittime","is_emergency_admission"]],
           on=["subject_id","hadm_id"], how="left")
    .merge(df_icu[["subject_id","hadm_id","icu_stay_id","icu_intime"]],
           on=["subject_id","hadm_id"], how="left")
)

print("베이스 코호트 shape:", df_base.shape)
df_base.head()


베이스 코호트 shape: (428410, 11)


Unnamed: 0,subject_id,hadm_id,ed_stay_id,ed_intime,ed_outtime,arrival_transport,disposition,admittime,is_emergency_admission,icu_stay_id,icu_intime
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,AMBULANCE,ADMITTED,2180-05-06 22:23:00,1.0,,NaT
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,AMBULANCE,ADMITTED,2180-06-26 18:27:00,0.0,,NaT
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,AMBULANCE,ADMITTED,2180-08-05 23:44:00,0.0,,NaT
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,AMBULANCE,HOME,2180-07-23 12:35:00,0.0,39553978.0,2180-07-23 14:00:00
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,AMBULANCE,ADMITTED,2180-07-23 12:35:00,0.0,39553978.0,2180-07-23 14:00:00
