In [12]:
import psycopg2
from dfply import *
import pandas as pd
import logging
import sys
import os
from pathlib import Path
from sshtunnel import SSHTunnelForwarder

def add_to_sys_path(directory):
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Args:
        directory: Directory to make importable (``str`` or ``pathlib.Path``).

    Returns:
        The string form of ``directory`` that is now present on ``sys.path``.
    """
    # sys.path holds plain strings, so a Path object never compares equal to
    # any entry. Compare the string form, otherwise every re-run of this cell
    # appends another duplicate entry.
    directory_str = str(directory)
    if directory_str not in sys.path:
        sys.path.append(directory_str)
    return directory_str


# Add the ../src directory (resolved against the current working directory)
# to sys.path.
# NOTE(review): the project imports in this cell use the `src.` package prefix,
# which is resolved from the directory that CONTAINS src — confirm that
# '../src' (rather than '..') is the intended entry.
module_path = Path('../src').resolve()
add_to_sys_path(module_path)

# Logging config
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 소스코드(src)
import src.data_extraction.access_database as db
import src.data_extraction.filter_adult_patients as fap
import src.data_extraction.filter_ventilation_events as fve
from src.utils import utils


In [20]:
# Load data
adults_icu = pd.read_csv('../outputs/adults_icu.csv')   # adult patient information
intubation_extubation_orgn = pd.read_csv(
    '../outputs/intubation_extubation_raw20240127.csv',
    parse_dates=['intubationtime', 'extubationtime',
                 'admittime', 'dischtime', 'deathtime'],
)   # intubation / extubation information


intubation_all = pd.read_csv('../outputs/intubation_data.csv', index_col=0, parse_dates=['intubationtime'])
extubation_all = pd.read_csv('../outputs/extubation_data.csv', index_col=0, parse_dates=['extubationtime'])

# Print each shape in its own statement. Joining the two print() calls with a
# comma builds a (None, None) tuple that the notebook echoes as a spurious
# cell result.
print(intubation_all.shape)
print(extubation_all.shape)

(8265, 6)
(22682, 7)


(None, None)

In [14]:
# Summarize the intubation table: index range, column dtypes and non-null counts.
intubation_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8265 entries, 0 to 8443
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   subject_id      8265 non-null   int64         
 1   hadm_id         8265 non-null   int64         
 2   stay_id         8265 non-null   int64         
 3   intubationtime  8265 non-null   datetime64[ns]
 4   itemid          8265 non-null   int64         
 5   patientweight   8265 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 452.0 KB


In [15]:
# Summarize the extubation table: index range, column dtypes and non-null counts.
extubation_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22682 entries, 0 to 23054
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subject_id       22682 non-null  int64         
 1   hadm_id          22682 non-null  int64         
 2   stay_id          22682 non-null  int64         
 3   extubationtime   22682 non-null  datetime64[ns]
 4   itemid           22682 non-null  int64         
 5   patientweight    22682 non-null  float64       
 6   extubationcause  22682 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 1.4+ MB


In [36]:
# Frequency of each extubation cause in the full extubation table.
extubation_all["extubationcause"].value_counts()

extubationcause
Planned Extubation                              21918
Unplanned Extubation (patient-initiated)          709
Unplanned Extubation (non-patient initiated)       55
Name: count, dtype: int64

In [33]:
# Distinct subject_id counts for the intubation and extubation tables, in that order.
tuple(events["subject_id"].nunique() for events in (intubation_all, extubation_all))

(6709, 19267)

In [34]:
# Distinct subject_id count in the previously saved intubation/extubation table.
intubation_extubation_orgn["subject_id"].nunique()

6709

In [18]:
## Data processing: intubation/extubation data (filter_ventilation_events)

# Join the intubation and extubation tables
intubation_extubation = fve.join_ventilation_and_rename(intubation_all, extubation_all)
# Inspect the joined table's columns, dtypes and non-null counts.
intubation_extubation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subject_id       10992 non-null  int64         
 1   hadm_id          10992 non-null  int64         
 2   int_stayid       10992 non-null  int64         
 3   intubationtime   10992 non-null  datetime64[ns]
 4   int_itemid       10992 non-null  int64         
 5   int_weight       10992 non-null  float64       
 6   ext_stayid       9446 non-null   float64       
 7   extubationtime   9446 non-null   datetime64[ns]
 8   ext_itemid       9446 non-null   float64       
 9   ext_weight       9446 non-null   float64       
 10  extubationcause  9446 non-null   object        
dtypes: datetime64[ns](2), float64(4), int64(4), object(1)
memory usage: 944.8+ KB


In [22]:
# Summarize the table loaded from the saved CSV, for comparison with the freshly joined one.
intubation_extubation_orgn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subject_id       10992 non-null  int64         
 1   hadm_id          10992 non-null  int64         
 2   int_stayid       10992 non-null  int64         
 3   admittime        10992 non-null  datetime64[ns]
 4   intubationtime   10992 non-null  datetime64[ns]
 5   int_itemid       10992 non-null  int64         
 6   int_weight       10992 non-null  float64       
 7   ext_stayid       9446 non-null   float64       
 8   extubationtime   9446 non-null   datetime64[ns]
 9   ext_itemid       9446 non-null   float64       
 10  ext_weight       9446 non-null   float64       
 11  extubationcause  9446 non-null   object        
 12  dischtime        10992 non-null  datetime64[ns]
 13  deathtime        2968 non-null   datetime64[ns]
dtypes: datetime64[ns](5), float64(4), int64(4), object(1)

In [26]:
# Frequency of each extubation cause in the previously saved joined table.
intubation_extubation_orgn["extubationcause"].value_counts()

extubationcause
Planned Extubation                              8788
Unplanned Extubation (patient-initiated)         591
Unplanned Extubation (non-patient initiated)      67
Name: count, dtype: int64

In [35]:
# Display the table loaded from the saved CSV for visual inspection.
intubation_extubation_orgn

Unnamed: 0,subject_id,hadm_id,int_stayid,admittime,intubationtime,int_itemid,int_weight,ext_stayid,extubationtime,ext_itemid,ext_weight,extubationcause,dischtime,deathtime
0,10001884,26184834,37510196,2131-01-07 20:39:00,2131-01-11 04:30:00,224385,65.0,37510196.0,2131-01-12 17:40:00,227194.0,65.0,Planned Extubation,2131-01-20 05:15:00,2131-01-20 05:15:00
1,10003400,23559586,38383343,2137-08-04 00:07:00,2137-08-17 21:21:00,224385,90.5,38383343.0,2137-08-21 15:40:00,227194.0,99.6,Planned Extubation,2137-09-02 17:05:00,2137-09-02 17:05:00
2,10004401,27939719,31202136,2144-04-11 03:31:00,2144-04-11 05:03:00,224385,120.0,,NaT,,,,2144-04-13 17:31:00,NaT
3,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-27 19:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,NaT
4,10004401,29988601,32773003,2144-01-23 07:58:00,2144-01-30 13:00:00,224385,76.0,32773003.0,2144-01-30 12:30:00,227194.0,76.0,Planned Extubation,2144-02-06 11:45:00,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10987,19997367,20617667,35616526,2126-04-20 07:15:00,2126-05-05 18:20:00,224385,59.0,35616526.0,2126-05-02 16:30:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,NaT
10988,19997367,20617667,35616526,2126-04-20 07:15:00,2126-05-05 18:20:00,224385,59.0,35616526.0,2126-05-08 17:29:00,227194.0,59.0,Planned Extubation,2126-05-19 14:15:00,NaT
10989,19999068,21606769,30143796,2161-08-24 04:10:00,2161-08-25 15:34:00,224385,55.8,30143796.0,2161-08-28 13:35:00,227194.0,55.8,Planned Extubation,2161-09-02 19:00:00,NaT
10990,19999442,26785317,32336619,2148-11-19 10:00:00,2148-11-19 19:00:00,224385,107.5,32336619.0,2148-11-20 14:15:00,227194.0,107.5,Planned Extubation,2148-12-04 16:25:00,NaT


In [28]:
# Count rows that have an intubation time but no extubation time.
has_intubation = intubation_extubation_orgn["intubationtime"].notna()
lacks_extubation = intubation_extubation_orgn["extubationtime"].isna()
(has_intubation & lacks_extubation).sum()

1546

In [31]:
# Extubation time is null: potential causes?
# Count the distinct patients who have at least one row without an extubation time.
extubation_is_null = intubation_extubation_orgn["extubationtime"].isna()
intubation_extubation_orgn.loc[extubation_is_null, "subject_id"].nunique()

1467

In [None]:

# Join admissions data
# NOTE(review): `admissions` is not defined in any cell shown in this notebook —
# confirm where it is loaded before running on a fresh kernel.
intubation_extubation = fve.join_admissions(intubation_extubation, admissions)

print(f'Processed intubation_extubation: {intubation_extubation.shape}')   # (10992, 14)
print(intubation_extubation.columns)

# Print the report on removal of duplicate / near-duplicate values
# NOTE(review): `intubation_data`, `extubation_data` and `time_diff` are not
# defined in any cell shown in this notebook — presumably outputs of an earlier
# filtering step; verify.
fve.report_filtering_stats('intubation', intubation_all, intubation_data, time_diff)
fve.report_filtering_stats('extubation', extubation_all, extubation_data, time_diff)

print(f'intubation_data: {intubation_data.shape}')
print(f'extubation_data: {extubation_data.shape}')
print(f'merged ventilation data: {intubation_extubation.shape}')

In [None]:

# Create and attach a stay id. The helper module is imported as `utils`
# (`from src.utils import utils`); the former `util.` prefix was an undefined
# name and raised NameError.
intubation_extubation = utils.create_stay_id(intubation_extubation)

# Ventilation table, used for handling missing values.
# NOTE(review): every other input in this notebook is read from '../outputs/';
# confirm that './data/' is the intended location for this file.
ventilation = pd.read_csv('./data/ventilation.csv', parse_dates=['starttime', 'endtime'])
ventilation_df = ventilation[ventilation.ventilation_status == 'InvasiveVent']   # keep invasive ventilation only

print(f'adults_icu: {adults_icu.shape}')   # (73181, 13)
print(f'intubation_extubation: {intubation_extubation.shape}')   # (10992, 14)
print(f'ventilation_df: {ventilation_df.shape}')   # (34800, 5)