In [36]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append('./utils')
from env import load_env_file
from stats import plot_distribution
from tqdm import tqdm

# Root directory for every CSV read below, obtained from the project-local
# helper imported from `env` (found via the './utils' path entry added above).
# NOTE(review): the log printed by this cell suggests the helper selects a
# per-OS env file (here .env.darwin) and returns the data directory it names;
# confirm against the helper's source.
DATA_DIR = load_env_file()

load env file
  root dir:
    /Users/k/Repo/gp-ibd
  current system:
    Darwin
  loaded env file:
    .env.darwin
  loaded data dir:
    /Users/k/Nutstore Files/毕设-EHR/DB
done.


In [37]:
# Read the per-admission table and parse the four event timestamps
# (admission, discharge, ED registration, ED departure) into datetimes.
df = pd.read_csv(os.path.join(DATA_DIR, 'outcome', 'admission_stay.csv'))
df = df.assign(**{
    name: pd.to_datetime(df[name])
    for name in ('admittime', 'dischtime', 'edregtime', 'edouttime')
})

# los_ad: discharge minus admission time.
# los_dp: ED departure minus ED registration time.
df = df.assign(
    los_ad=df['dischtime'] - df['admittime'],
    los_dp=df['edouttime'] - df['edregtime'],
)

df.head(2)

Unnamed: 0,subject_id,hadm_id,admission_type,anchor_year,admittime,dischtime,edregtime,edouttime,deathtime,los_ad,los_dp
0,10098672,21229395.0,EW EMER.,2140,2142-05-16 04:04:00,2142-05-23 14:15:00,2142-05-15 15:49:00,2142-05-16 05:45:00,,7 days 10:11:00,0 days 13:56:00
1,10098672,21259834.0,EW EMER.,2140,2141-04-13 17:30:00,2141-04-17 19:17:00,2141-04-13 12:17:00,2141-04-13 18:47:00,,4 days 01:47:00,0 days 06:30:00


In [38]:
# Per-patient totals: sum both stay durations over all of a subject's rows.
stats = df.groupby('subject_id')[['los_ad', 'los_dp']].sum()

stats.head()

Unnamed: 0_level_0,los_ad,los_dp
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10001186,16 days 12:06:00,0 days 18:33:00
10007174,4 days 06:26:00,1 days 15:52:00
10018852,12 days 16:21:00,0 days 10:07:00
10024331,97 days 14:06:00,6 days 07:42:00
10025647,59 days 15:40:00,4 days 22:56:00


In [39]:
# Load the patient table and reduce it to one row per subject_id, keeping the
# first gender and anchor_age value pandas' groupby `first` finds for each.
df_demo = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'))
stats_demo = df_demo.groupby('subject_id')[['gender', 'anchor_age']].first()

stats_demo.head()

Unnamed: 0_level_0,gender,anchor_age
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10001186,F,46
10007174,M,70
10018852,M,19
10024331,M,72
10025647,M,83


In [21]:
# Join per-patient stay totals with demographics on subject_id (both frames
# come from a groupby on subject_id; the default join keeps matching keys only).
df_merge = stats.merge(stats_demo, on='subject_id')

def to_group(age) -> int:
    """Return the age-bracket code for `age`.

    0 when age <= 35, 1 when age <= 55, and 2 for everything else
    (including values for which both comparisons are false).
    """
    return 0 if age <= 35 else (1 if age <= 55 else 2)

# Bucket every patient's anchor_age into its bracket code and preview the frame.
df_merge['age_group'] = df_merge['anchor_age'].apply(to_group)
df_merge.head()

Unnamed: 0_level_0,los_ad,los_dp,gender,anchor_age,age_group
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10001186,16 days 12:06:00,0 days 18:33:00,F,46,1
10007174,4 days 06:26:00,1 days 15:52:00,M,70,2
10018852,12 days 16:21:00,0 days 10:07:00,M,19,0
10024331,97 days 14:06:00,6 days 07:42:00,M,72,2
10025647,59 days 15:40:00,4 days 22:56:00,M,83,2


In [26]:
# By Age
df_merge.groupby('age_group').agg({
    'los_ad': ['mean', 'std'],
    'los_dp': ['mean', 'std'],
})

Unnamed: 0_level_0,los_ad,los_ad,los_dp,los_dp
Unnamed: 0_level_1,mean,std,mean,std
age_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,103 days 09:35:34.858096828,637 days 20:18:09.922373112,6 days 05:17:21.035058430,54 days 06:53:55.644382717
1,123 days 20:27:25.898734178,984 days 17:56:47.681952784,6 days 12:21:50.810126582,47 days 14:39:16.641040697
2,84 days 16:34:49.844357977,313 days 04:49:15.526305332,3 days 23:06:30.291828793,16 days 15:22:28.737098979


In [27]:
# By Gender
df_merge.groupby('gender').agg({
    'los_ad': ['mean', 'std'],
    'los_dp': ['mean', 'std'],
})

Unnamed: 0_level_0,los_ad,los_ad,los_dp,los_dp
Unnamed: 0_level_1,mean,std,mean,std
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,101 days 09:00:07.600596125,551 days 21:33:18.579145832,5 days 05:37:33.263785394,39 days 12:37:19.839075658
M,103 days 01:35:57.767441860,808 days 02:50:10.606630528,5 days 12:13:01.451162790,40 days 07:01:55.318754470


# ICU Stay

In [35]:
# Load the ICU-stay table (columns shown in the output: subject_id, stay_id,
# los, dod) and display it.
_icu_stay_csv = os.path.join(DATA_DIR, 'outcome', 'icu_stay.csv')
df_icu = pd.read_csv(_icu_stay_csv)
df_icu

Unnamed: 0,subject_id,stay_id,los,dod
0,10098672,,,
1,10303503,37900721.0,0.316667,
2,10303503,30774040.0,1.263980,
3,10303503,33780253.0,1.961840,
4,10312715,,,
...,...,...,...,...
9341,10556676,36225739.0,10.129100,
9342,10556676,38407512.0,0.985706,
9343,10556676,39626730.0,8.517510,
9344,14458834,,,2162-03-15


In [42]:
# Total ICU `los` per patient. pandas' sum skips missing values, so a subject
# whose rows all have a missing `los` ends up with a total of 0.0.
stats_icu = df_icu.groupby('subject_id')[['los']].sum()
stats_icu.head()

Unnamed: 0_level_0,los
subject_id,Unnamed: 1_level_1
10001186,0.0
10007174,0.0
10018852,0.0
10024331,8.21142
10025647,3.9362


In [43]:
# Join per-patient ICU totals with demographics on subject_id
# (default join: only subjects present in both frames are kept).
df_merge_icu = stats_icu.merge(stats_demo, on='subject_id')

def to_group(age) -> int:
    """Return the age-bracket code for `age`: 0 (<= 35), 1 (<= 55), else 2.

    NOTE(review): this repeats the `to_group` defined in the admissions
    section of this notebook; the two should be kept in sync or merged.
    """
    return 0 if age <= 35 else (1 if age <= 55 else 2)

# Bucket ages using this frame's own anchor_age column. Reading the column
# from df_merge (the admissions frame) instead would only line up through
# index alignment: any subject absent from df_merge would get a NaN age_group
# and silently drop out of the groupby summaries, and the cell would depend on
# df_merge having been built earlier in the session.
df_merge_icu['age_group'] = df_merge_icu['anchor_age'].apply(to_group)
df_merge_icu.head()

Unnamed: 0_level_0,los,gender,anchor_age,age_group
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001186,0.0,F,46,1
10007174,0.0,M,70,2
10018852,0.0,M,19,0
10024331,8.21142,M,72,2
10025647,3.9362,M,83,2


In [45]:
# By Age
df_merge_icu.groupby('age_group').agg({
    'los': ['mean', 'std'],
})

Unnamed: 0_level_0,los,los
Unnamed: 0_level_1,mean,std
age_group,Unnamed: 1_level_2,Unnamed: 2_level_2
0,1.253913,6.865442
1,9.96611,114.719507
2,8.219719,39.48591


In [46]:
# By Gender
# Mean and standard deviation of total ICU los, per gender
df_merge_icu.groupby('gender')[['los']].agg(['mean', 'std'])

Unnamed: 0_level_0,los,los
Unnamed: 0_level_1,mean,std
gender,Unnamed: 1_level_2,Unnamed: 2_level_2
F,6.40461,36.764249
M,7.887633,97.585046
