## Demographics (단일변수) 붙이기
- Demographics:
    - age (hadm id)
    - height
    - weight (보류: admit, daily 어떻게 쓸지 결정 필요)
    - gender


In [2]:
import numpy as np
import pandas as pd
from dfply import *
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import timedelta
import time
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import sys
from sshtunnel import SSHTunnelForwarder
import logging
sys.path.append('/Users/sdc/Documents/bida_lab/db/P1_extubationfailure/서대철')

# 데이터 정제 시 필요한 모듈들
import src.subjectlist_alignment.pairing as pairing   # 삽관 발관 페어링 관련 모듈
from src.subjectlist_alignment import reintubation as rnt   # 재삽관 시간 계산 관련 모듈
from src.subjectlist_alignment import imputation as imp   # 결측치 대체 관련 모듈
from src.subjectlist_alignment import subject_classification as cls

import src.utils.utils as utils   # 기타 유틸리티 모듈
import src.data_extraction.access_database as db   # DB 연결 관련 모듈

### subjectlist 가져오기

In [3]:
# Load the subject list, parsing every timestamp column as datetime.
timestamp_columns = [
    'admittime', 'intubationtime', 'dischtime',
    'deathtime', 'extubationtime', 'reintubation_eventtime',
]
subjectlist1 = pd.read_csv('../outputs/subjectlist1.csv', parse_dates=timestamp_columns)

# Keep only the identifier columns, then collapse to one row per stay_id.
id_only = subjectlist1 >> select('subject_id', 'hadm_id', 'stay_id')
sub1list = id_only.drop_duplicates(subset='stay_id')
sub1list

Unnamed: 0,subject_id,hadm_id,stay_id
0,10001884,26184834,37510196
1,10003400,23559586,38383343
2,10004401,27939719,31202136
3,10004401,29988601,32773003
5,10005817,28661809,31316840
...,...,...,...
8995,19995595,21784060,34670930
8997,19997367,20617667,35616526
9000,19999068,21606769,30143796
9001,19999442,26785317,32336619


### DB 연결 후 테이블 가져오기

In [4]:
# 1. Parameter setup
import getpass
import os


def _read_secret(env_name, prompt):
    """Return the secret held in environment variable *env_name*.

    Falls back to an interactive, non-echoing prompt when the variable is
    unset or empty, so the secret never has to be written into the notebook.
    """
    value = os.environ.get(env_name)
    if value:
        return value
    return getpass.getpass(prompt)


# Where outputs are written
# output_dir = './data'   # used when running from a collaborator's directory
output_dir = '../outputs'   # used when running from my own directory


# SSH connection settings
# SECURITY: the password used to be hard-coded here. It is now read from the
# MIMIC_SSH_PASSWORD environment variable (or prompted for). Any credential
# that was previously committed should be considered leaked and rotated.
ssh_config = {
    'ssh_ip': '210.91.223.248',
    'ssh_port': 35430,
    'ssh_username': "medicalai",
    'ssh_password': _read_secret('MIMIC_SSH_PASSWORD', 'SSH password: '),
    'remote_bind_address': ('localhost', 35432)  # database host and port
}

# Database connection settings
# SECURITY: same as above - the password comes from MIMIC_DB_PASSWORD.
db_config = {
    'database': 'mimiciv',
    'user': 'mai_admin',
    'password': _read_secret('MIMIC_DB_PASSWORD', 'Database password: '),
    'host': 'localhost',  # when going through the SSH tunnel this is 'localhost'
    'port': 35432  # replaced by the SSH tunnel's local port binding
}


# Queries for the tables to pull (key = name used in the result dict)
tables_query = {
    'patients': "select * from mimiciv_hosp.patients;",   # patient info (gender is taken from here)
    'age': "select * from mimiciv_derived.age;",   # age
    'height': "select * from mimiciv_derived.height;",   # height
    'weight': "select * from mimiciv_derived.weight_durations;"   # weight (admit, daily)
}


db.print_config_info(db_config, tables_query)

--------- Database Configuration ---------
Database Name: mimiciv
User: mai_admin
Password: ***********
Host: localhost
Port: 35432

--------- SQL Queries for Required Tables ---------
patients: select * from mimiciv_hosp.patients;
age: select * from mimiciv_derived.age;
height: select * from mimiciv_derived.height;
weight: select * from mimiciv_derived.weight_durations;



In [5]:
# 2. Connect to the DB, pull the tables, and store them in the `dataframes` dict

# Pre-bind these names so that the `finally` clause and the check below never
# raise a NameError (which would hide the real error) when the connection
# attempt itself fails.
conn = None
tunnel = None
dataframes = None

try:
    conn, tunnel = db.connect_to_database_via_ssh(db_config, ssh_config)
    if conn is not None:
        # Run the queries while the connection is open
        dataframes = db.retrieve_data(conn, tables_query)
    else:
        logging.error("Failed to connect to the database.")
except Exception as e:
    logging.error(f"An error occurred: {e}")
    # Re-raise: continuing would only fail later with a less helpful error.
    raise
finally:
    if conn is not None:
        db.disconnect_database(conn)
    # NOTE(review): `tunnel` is never closed explicitly in this cell - confirm
    # whether db.disconnect_database also shuts down the SSH tunnel.

if dataframes is None:
    raise RuntimeError("No tables were retrieved; cannot build the demographics table.")


# Bind each retrieved table to its own variable
## Demographics
patients = dataframes['patients']
age = dataframes['age']
height = dataframes['height']
weight = dataframes['weight']


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.2p1)
INFO:paramiko.transport:Authentication (password) successful!
INFO:root:SSH TUNNEL ESTABLISHED...
INFO:root:mimiciv DATABASE CONNECTED VIA SSH.
INFO:root:Retrieved patients: (299712, 6)
INFO:root:Retrieved age: (431231, 6)
INFO:root:Retrieved height: (33474, 4)
INFO:root:Retrieved weight: (272447, 5)
INFO:root:DATABASE CONNCETION CLOSED.


patients: (299712, 6)
age: (431231, 6)
height: (33474, 4)
weight: (272447, 5)


In [6]:
patients  # display the raw patients table (source of the gender column)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,
...,...,...,...,...,...,...
299707,19999828,F,46,2147,2017 - 2019,
299708,19999829,F,28,2186,2008 - 2010,
299709,19999840,M,58,2164,2008 - 2010,2164-09-17
299710,19999914,F,49,2158,2017 - 2019,


In [9]:
# Attach gender (joined on subject_id)
gender_lookup = patients.loc[:, ['subject_id', 'gender']]
sub1list_gender = sub1list.merge(gender_lookup, how='left', on='subject_id')
sub1list_gender

Unnamed: 0,subject_id,hadm_id,stay_id,gender
0,10001884,26184834,37510196,F
1,10003400,23559586,38383343,F
2,10004401,27939719,31202136,M
3,10004401,29988601,32773003,M
4,10005817,28661809,31316840,M
...,...,...,...,...
7226,19995595,21784060,34670930,M
7227,19997367,20617667,35616526,F
7228,19999068,21606769,30143796,M
7229,19999442,26785317,32336619,M


In [12]:
age  # display the raw age table (one `age` value per subject_id / hadm_id row)

Unnamed: 0,subject_id,hadm_id,admittime,anchor_age,anchor_year,age
0,10000084,23052089,2160-11-21 01:56:00,72,2160,72.89004154931676958169
1,10000084,29888819,2160-12-28 05:11:00,72,2160,72.99171500600210879971
2,10000764,27897940,2132-10-14 23:31:00,86,2132,86.78846315897709220492
3,10000886,21927847,2178-05-08 08:07:00,18,2178,18.34864061209949689369
4,10000980,20897796,2193-08-15 01:01:00,73,2186,80.6197216122765485
...,...,...,...,...,...,...
431226,19999784,29355057,2119-10-17 10:28:00,57,2119,57.79245024151414982699
431227,19999784,29889147,2120-10-25 09:43:00,57,2119,58.8163433041958787
431228,19999784,29956342,2121-01-31 00:00:00,57,2119,59.0835500846014423
431229,19999828,25744818,2149-01-08 16:44:00,46,2147,48.0224870694559285


In [14]:
# Attach age (joined per admission, i.e. on subject_id + hadm_id)
admission_keys = ['subject_id', 'hadm_id']
age_lookup = age[admission_keys + ['age']]
sub1list_age = sub1list_gender.merge(age_lookup, how='left', on=admission_keys)
sub1list_age

Unnamed: 0,subject_id,hadm_id,stay_id,gender,age
0,10001884,26184834,37510196,F,77.0182958604614657
1,10003400,23559586,38383343,F,75.5894143091734004
2,10004401,27939719,31202136,M,85.2749424430316825
3,10004401,29988601,32773003,M,85.0591551476676955
4,10005817,28661809,31316840,M,69.0087243526210020
...,...,...,...,...,...
7226,19995595,21784060,34670930,M,68.79373363718058468388
7227,19997367,20617667,35616526,F,63.29925934950891007423
7228,19999068,21606769,30143796,M,63.64388435916765079348
7229,19999442,26785317,32336619,M,43.8841608212272046


In [15]:
height  # display the raw height table (rows carry subject_id, stay_id, charttime, height)

Unnamed: 0,subject_id,stay_id,charttime,height
0,10000032,39553978,2180-07-23 12:36:00,152.00
1,10001725,31205490,2110-04-11 15:52:00,157.00
2,10001884,37510196,2131-01-11 04:20:00,157.00
3,10002013,39060235,2160-05-18 10:00:00,157.00
4,10002428,33987268,2156-04-12 16:24:00,150.00
...,...,...,...,...
33469,19999287,37692584,2197-07-26 03:31:00,163.00
33470,19999287,35165301,2197-08-03 20:58:00,165.00
33471,19999297,37364566,2162-08-16 05:48:00,165.00
33472,19999442,32336619,2148-11-19 14:23:00,193.00


In [17]:
# Attach height (joined on subject_id + stay_id) - some stays have no height, so NaN is expected
#
# The height table carries a `charttime` column, so a single stay may have more
# than one measurement. A left merge against a non-unique key would silently
# duplicate those stays, breaking the one-row-per-stay_id shape of the subject
# list. Keep only the earliest charted height per stay before merging.
# NOTE(review): "earliest measurement" is a modelling choice - confirm it is the
# one wanted (alternatives: latest, mean).
height_per_stay = (
    height
    .sort_values('charttime', kind='mergesort')   # stable sort keeps ties deterministic
    .drop_duplicates(subset=['subject_id', 'stay_id'], keep='first')
)

sub1list_height = pd.merge(
    sub1list_age,
    height_per_stay[['subject_id', 'stay_id', 'height']],
    on=['subject_id', 'stay_id'],
    how='left',
)
sub1list_height

Unnamed: 0,subject_id,hadm_id,stay_id,gender,age,height
0,10001884,26184834,37510196,F,77.0182958604614657,157.00
1,10003400,23559586,38383343,F,75.5894143091734004,165.00
2,10004401,27939719,31202136,M,85.2749424430316825,
3,10004401,29988601,32773003,M,85.0591551476676955,168.00
4,10005817,28661809,31316840,M,69.0087243526210020,175.00
...,...,...,...,...,...,...
7226,19995595,21784060,34670930,M,68.79373363718058468388,163.00
7227,19997367,20617667,35616526,F,63.29925934950891007423,147.00
7228,19999068,21606769,30143796,M,63.64388435916765079348,170.00
7229,19999442,26785317,32336619,M,43.8841608212272046,193.00


In [26]:
weight   # weight_type is one of ['admit', 'daily']; for now only the 'admit' rows are joined onto the subject list.

Unnamed: 0,stay_id,starttime,endtime,weight,weight_type
0,30000153,2174-09-29 10:09:00,2174-09-29 16:00:00,70.0,admit
1,30000153,2174-09-29 16:00:00,2174-10-01 05:26:10,73.0,daily
2,30000213,2162-06-21 03:38:00,2162-06-22 00:00:00,84.7,admit
3,30000213,2162-06-22 00:00:00,2162-06-22 22:52:48,73.7,daily
4,30000484,2136-01-14 15:23:32,2136-01-17 06:53:08,68.5,admit
...,...,...,...,...,...
272442,39989059,2163-10-19 12:33:55,2163-10-19 15:00:00,103.0,daily
272443,39990748,2112-12-12 17:36:53,2112-12-12 20:00:00,46.0,daily
272444,39991309,2158-06-07 08:24:12,2158-06-08 04:00:00,87.8,daily
272445,39997370,2134-03-14 15:25:06,2134-03-16 05:00:00,73.1,daily


In [30]:
# Attach weight (only rows with weight_type == 'admit'; joined on stay_id) - some stays have no weight, so NaN is expected
#
# The weight table is interval-based (starttime / endtime), so one stay may have
# several 'admit' rows. Keep the earliest one per stay so the left merge cannot
# duplicate stays in the subject list.
# NOTE(review): "earliest admit weight" is a modelling choice - confirm.
weight_admit = (
    weight[weight.weight_type == 'admit']
    .sort_values('starttime', kind='mergesort')   # stable sort keeps ties deterministic
    .drop_duplicates(subset='stay_id', keep='first')
)

sub1list_weight = pd.merge(
    sub1list_height,
    weight_admit[['stay_id', 'weight', 'weight_type']],
    on=['stay_id'],
    how='left',
)
sub1list_weight

Unnamed: 0,subject_id,hadm_id,stay_id,gender,age,height,weight,weight_type
0,10001884,26184834,37510196,F,77.0182958604614657,157.00,65.0,admit
1,10003400,23559586,38383343,F,75.5894143091734004,165.00,99.6,admit
2,10004401,27939719,31202136,M,85.2749424430316825,,80.4,admit
3,10004401,29988601,32773003,M,85.0591551476676955,168.00,76.0,admit
4,10005817,28661809,31316840,M,69.0087243526210020,175.00,88.8,admit
...,...,...,...,...,...,...,...,...
7226,19995595,21784060,34670930,M,68.79373363718058468388,163.00,94.7,admit
7227,19997367,20617667,35616526,F,63.29925934950891007423,147.00,59.0,admit
7228,19999068,21606769,30143796,M,63.64388435916765079348,170.00,55.8,admit
7229,19999442,26785317,32336619,M,43.8841608212272046,193.00,107.5,admit


In [33]:
# This table is the base onto which later features will be joined.
file_name = f"{output_dir}/sub1list.csv"
sub1list_weight.to_csv(
    file_name,
    encoding='utf-8-sig',
    index=False,
)