In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

from utils import load_env_file, set_mpl_configs
from utils import leave_percentile, distribution_analysis

load_env_file()
set_mpl_configs()

DATA_DIR = os.getenv('DATA_DIR')
print('DATA_DIR: {}'.format(DATA_DIR))

load env file
  root dir:
    /Users/k/Repo/gp-ibd
  current system:
    Darwin
  load .env.darwin
  loaded data dir:
    /Users/k/Nutstore Files/毕设-EHR/DB
done.
set matplotlib configs
  font family:
    ['Times New Roman']
done.
DATA_DIR: /Users/k/Nutstore Files/毕设-EHR/DB


In [2]:
df_ibd = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'))
df_ibd.head(10)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,subject_id.1,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10098672,21229395,1,9975,9,10098672,M,61,2140,2011 - 2013,
1,10098672,21229395,2,5990,9,10098672,M,61,2140,2011 - 2013,
2,10098672,21229395,3,5849,9,10098672,M,61,2140,2011 - 2013,
3,10098672,21229395,4,5559,9,10098672,M,61,2140,2011 - 2013,
4,10098672,21229395,5,5793,9,10098672,M,61,2140,2011 - 2013,
5,10098672,21229395,6,E8788,9,10098672,M,61,2140,2011 - 2013,
6,10098672,21229395,7,0413,9,10098672,M,61,2140,2011 - 2013,
7,10098672,21229395,8,0416,9,10098672,M,61,2140,2011 - 2013,
8,10098672,21229395,9,5853,9,10098672,M,61,2140,2011 - 2013,
9,10098672,21229395,10,2809,9,10098672,M,61,2140,2011 - 2013,


In [3]:
# is UC or CD
def is_ibd(some_icd_codes: [str]) -> bool:
    for icd_code in some_icd_codes:
        if (icd_code.startswith('555') or icd_code.startswith('556')) and len(icd_code) == 4:
            return True
    return False


def is_cd(some_icd_codes: [str]) -> bool:
    for icd_code in some_icd_codes:
        if icd_code in ['5550', '5551', '5552', '5559']:
            return True
    return False

def is_uc(some_icd_codes: [str]) -> bool:
    for icd_code in some_icd_codes:
        if icd_code in ['5560', '5561', '5562', '5563', '5564', '5565', '5566', '5568', '5569']:
            return True
    return False

In [4]:
stats = df_ibd.groupby('subject_id').agg({
    'gender': 'first',
    'anchor_age': 'first',
    'icd_code': [
        ('ibd', lambda x: is_ibd(x)), 
        ('cd', lambda x: is_cd(x)), 
        ('uc', lambda x: is_uc(x)),
    ]
})
# print(stats.shape)
stats.head()

Unnamed: 0_level_0,gender,anchor_age,icd_code,icd_code,icd_code
Unnamed: 0_level_1,first,first,ibd,cd,uc
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10001186,F,46,True,True,False
10007174,M,70,True,True,False
10018852,M,19,True,False,True
10024331,M,72,True,False,True
10025647,M,83,True,False,True


In [5]:
# 只得了一种 IBD 的患者
one_ibd_stats = stats[~((stats['icd_code']['uc'] == True) & (stats['icd_code']['cd'] == True))]
# 得了两种 IBD 的患者
both_ibd_stats = stats[((stats['icd_code']['uc'] == True) & (stats['icd_code']['cd'] == True))]
# 得了 UC 的患者
uc_ibd_stats = stats[(stats['icd_code']['uc'] == True)]
# 得了 CD 的患者
uc_ibd_stats = stats[(stats['icd_code']['cd'] == True)]
# 只得了 UC 的患者
only_uc_ibd_stats = stats[((stats['icd_code']['uc'] == True) & (stats['icd_code']['cd'] == False))]
# 只得了 CD 的患者
only_cd_ibd_stats = stats[((stats['icd_code']['uc'] == False) & (stats['icd_code']['cd'] == True))]


In [6]:
# 各个 Group 的 IBD Patients 数目
total_patients_nums = stats.shape[0]
print('Total IBD patients:\n  {}'.format(total_patients_nums))
print('  Only UC patiens:\n    {} ({:.2f}%)'.format(only_uc_ibd_stats.shape[0], (only_uc_ibd_stats.shape[0] / total_patients_nums*100)))
print('  Only CD patiens:\n    {} ({:.2f}%)'.format(only_cd_ibd_stats.shape[0], (only_cd_ibd_stats.shape[0] / total_patients_nums*100)))
print('  Both UC and CD patients:\n    {} ({:.2f}%)'.format(both_ibd_stats.shape[0], (both_ibd_stats.shape[0] / total_patients_nums*100)))

Total IBD patients:
  2417
  Only UC patiens:
    1052 (43.53%)
  Only CD patiens:
    1280 (52.96%)
  Both UC and CD patients:
    85 (3.52%)


In [7]:
# Store patiens list
import json

data = {
    'both_ibd': both_ibd_stats.index.tolist(),
    'only_uc': only_uc_ibd_stats.index.tolist(),
    'only_cd': only_cd_ibd_stats.index.tolist(),
}

with open('../data/ibd_demo.json', 'w') as f:
    json.dump(data, f)

# Baseline charateristics

In [8]:
with open('../data/ibd_demo.json', 'r') as f:
    data = json.loads(f.read())
    
both_ibd_patients: list = data['both_ibd']
only_uc_patients: list = data['only_uc']
only_cd_patients: list = data['only_cd']

patients_list: list = both_ibd_patients + only_uc_patients + only_cd_patients

## Gender

In [9]:
stats[stats.index.isin(both_ibd_stats)]

Unnamed: 0_level_0,gender,anchor_age,icd_code,icd_code,icd_code
Unnamed: 0_level_1,first,first,ibd,cd,uc
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2


In [10]:
total_patients_nums = stats.shape[0]

for i, _patients_list in enumerate([stats.index.values, both_ibd_patients, only_cd_patients, only_uc_patients]):
    stats_tmp = stats[stats.index.isin(_patients_list)]
    print('-'*50)
    print('Group {}'.format(i))
    print('  Total patients num:\n    {}'.format(stats_tmp.shape[0]))
    print('  Male patients nums:\n    {} ({:.2f}%)'.format(stats_tmp[stats_tmp['gender']['first'] == 'M'].shape[0], (stats_tmp[stats_tmp['gender']['first'] == 'M'].shape[0] / stats_tmp.shape[0]*100)))

--------------------------------------------------
Group 0
  Total patients num:
    2417
  Male patients nums:
    1075 (44.48%)
--------------------------------------------------
Group 1
  Total patients num:
    85
  Male patients nums:
    40 (47.06%)
--------------------------------------------------
Group 2
  Total patients num:
    1280
  Male patients nums:
    541 (42.27%)
--------------------------------------------------
Group 3
  Total patients num:
    1052
  Male patients nums:
    494 (46.96%)


## Age

In [11]:
for i, _patients_list in enumerate([stats.index.values, both_ibd_patients, only_cd_patients, only_uc_patients]):
    print('Group {}'.format(i))
    
    stats_tmp = stats[stats.index.isin(_patients_list)]
    print('  Mean Age (Std):\n    {:.2f} +- {:.2f}'.format(stats_tmp['anchor_age']['first'].mean(), stats_tmp['anchor_age']['first'].std()))
    
    # break

Group 0
  Mean Age (Std):
    51.37 +- 18.55
Group 1
  Mean Age (Std):
    46.98 +- 19.19
Group 2
  Mean Age (Std):
    50.27 +- 17.99
Group 3
  Mean Age (Std):
    53.07 +- 19.03


## Weight

> Fetch data from OMR table

In [12]:
# Load OMR Information
df = pd.read_csv(os.path.join(DATA_DIR, 'omr', 'ibd_omr.csv'))
print(df.result_name.value_counts())
df = df[df['result_name'].isin(['Blood Pressure', 'Weight (Lbs)', 'BMI (kg/m2)', 'Height (Inches)'])]
df.head()

result_name
Blood Pressure                      49232
Weight (Lbs)                        40103
BMI (kg/m2)                         34600
Height (Inches)                     13767
Blood Pressure Sitting                 45
Blood Pressure Lying                   41
Blood Pressure Standing (1 min)        25
BMI                                    11
Blood Pressure Standing                 8
Blood Pressure Standing (3 mins)        6
Weight                                  3
Height                                  2
eGFR                                    1
Name: count, dtype: int64


Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
0,10098672,2140-04-18,1,Blood Pressure,120/74
1,10098672,2140-04-18,1,Weight (Lbs),156
2,10098672,2140-11-14,1,Blood Pressure,110/66
3,10098672,2140-11-14,1,BMI (kg/m2),21.6
4,10098672,2140-11-14,1,Height (Inches),66.25


In [13]:
weight_df = df[df['subject_id'].isin(patients_list) & (df['result_name'] == 'Weight (Lbs)')]
weight_df.loc[:, 'result_value'] = weight_df['result_value'].apply(float)

weight_df.head()

Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
1,10098672,2140-04-18,1,Weight (Lbs),156.0
5,10098672,2140-11-14,1,Weight (Lbs),135.0
8,10098672,2140-12-01,1,Weight (Lbs),159.0
11,10098672,2141-01-05,1,Weight (Lbs),170.0
15,10098672,2141-04-13,1,Weight (Lbs),164.0


In [14]:
weight_stats = weight_df.groupby('subject_id').agg({'result_value': ['mean', 'std', 'min', 'max']})
weight_stats.head()

Unnamed: 0_level_0,result_value,result_value,result_value,result_value
Unnamed: 0_level_1,mean,std,min,max
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
10007174,166.995,3.273904,164.68,169.31
10018852,178.4,15.274161,160.0,192.0
10024331,248.115,19.141365,216.2,310.0
10025647,171.0,5.656854,167.0,175.0
10027407,185.0,,185.0,185.0


In [15]:
# Report Missing Values
for i, _patients_list in enumerate([both_ibd_patients, only_uc_patients, only_cd_patients]):
    missing_patiens_list = [_ for _ in _patients_list if _ not in weight_stats.index.values]
    print('*'*50)
    print('Group {}'.format(i))
    print('  Total nums:\n    {}'.format(len(_patients_list)))
    print('  Total missing patients:\n    {} ({:.2f}%)'.format(len(missing_patiens_list), (len(missing_patiens_list) / len(_patients_list)*100)))
    # print(missing_patiens_list)

**************************************************
Group 0
  Total nums:
    85
  Total missing patients:
    12 (14.12%)
**************************************************
Group 1
  Total nums:
    1052
  Total missing patients:
    287 (27.28%)
**************************************************
Group 2
  Total nums:
    1280
  Total missing patients:
    328 (25.62%)


In [16]:
for i, _patients_list in enumerate([stats.index.values, both_ibd_patients, only_cd_patients, only_uc_patients]):
    print('*'*50)
    print('Group {}'.format(i))
    stats_tmp = weight_stats[weight_stats.index.isin(_patients_list)]
    print('  Mean Weight (Std):\n    {:.2f} +- {:.2f}'.format(stats_tmp['result_value']['mean'].mean(), stats_tmp['result_value']['mean'].std()))

    # break

**************************************************
Group 0
  Mean Weight (Std):
    167.46 +- 59.33
**************************************************
Group 1
  Mean Weight (Std):
    161.26 +- 45.23
**************************************************
Group 2
  Mean Weight (Std):
    165.65 +- 43.23
**************************************************
Group 3
  Mean Weight (Std):
    170.31 +- 75.56


## BMI

In [17]:
bmi_df = df[df['subject_id'].isin(patients_list) & (df['result_name'] == 'BMI (kg/m2)')]
bmi_df.loc[:, 'result_value'] = bmi_df['result_value'].apply(float)

bmi_df.head()

Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
3,10098672,2140-11-14,1,BMI (kg/m2),21.6
7,10098672,2140-12-01,1,BMI (kg/m2),25.5
10,10098672,2141-01-05,1,BMI (kg/m2),27.2
13,10098672,2141-04-13,1,BMI (kg/m2),25.5
17,10098672,2141-05-04,1,BMI (kg/m2),25.9


In [18]:
bmi_stats = bmi_df.groupby('subject_id').agg({'result_value': ['mean', 'std', 'min', 'max']})
bmi_stats.head()

Unnamed: 0_level_0,result_value,result_value,result_value,result_value
Unnamed: 0_level_1,mean,std,min,max
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
10007174,23.3,0.424264,23.0,23.6
10018852,26.0,,26.0,26.0
10024331,31.755556,2.968109,29.3,43.2
10027407,28.1,,28.1,28.1
10027957,24.760256,0.970117,21.1,26.6


In [19]:
# Report Missing Values
for i, _patients_list in enumerate([both_ibd_patients, only_uc_patients, only_cd_patients]):
    missing_patiens_list = [_ for _ in _patients_list if _ not in bmi_df.index.values]
    print('*'*50)
    print('Group {}'.format(i))
    print('  Total nums:\n    {}'.format(len(_patients_list)))
    print('  Total missing patients:\n    {} ({:.2f}%)'.format(len(missing_patiens_list), (len(missing_patiens_list) / len(_patients_list)*100)))
    # print(missing_patiens_list)

**************************************************
Group 0
  Total nums:
    85
  Total missing patients:
    85 (100.00%)
**************************************************
Group 1
  Total nums:
    1052
  Total missing patients:
    1052 (100.00%)
**************************************************
Group 2
  Total nums:
    1280
  Total missing patients:
    1280 (100.00%)


In [20]:
for i, _patients_list in enumerate([stats.index.values, both_ibd_patients, only_cd_patients, only_uc_patients]):
    print('*'*50)
    print('Group {}'.format(i))
    stats_tmp = bmi_stats[bmi_stats.index.isin(_patients_list)]
    print('  Mean BMI (Std):\n    {:.2f} +- {:.2f}'.format(stats_tmp['result_value']['mean'].mean(), stats_tmp['result_value']['mean'].std()))

    # break

**************************************************
Group 0
  Mean BMI (Std):
    27.26 +- 13.99
**************************************************
Group 1
  Mean BMI (Std):
    25.93 +- 6.40
**************************************************
Group 2
  Mean BMI (Std):
    26.81 +- 6.33
**************************************************
Group 3
  Mean BMI (Std):
    27.94 +- 20.08


# Vital Signs on Admission

## Heart rate

In [70]:
df = pd.read_csv(os.path.join(DATA_DIR, 'vital_signs_on_admission', 'HeartRate_Death.csv'))
df.head()

Unnamed: 0,subject_id,charttime,storetime,itemid,value,valuenum,dod
0,10303503,2155-11-08 07:00:00,2155-11-08 08:09:00,220045,74.0,74.0,
1,10303503,2155-11-08 08:00:00,2155-11-08 08:09:00,220045,75.0,75.0,
2,10303503,2155-11-08 09:00:00,2155-11-08 10:04:00,220045,69.0,69.0,
3,10303503,2155-11-08 10:00:00,2155-11-08 10:04:00,220045,74.0,74.0,
4,10303503,2155-11-08 03:49:00,2155-11-08 03:53:00,220045,89.0,89.0,


In [73]:
df.subject_id.value_counts()

subject_id
12468016    97200
11204646    33440
13158454    13450
10439781    13020
15219971    11034
            ...  
10303776       12
15814090       12
11955308       11
13126529       11
18002691        5
Name: count, Length: 683, dtype: int64

## Blood Pressure - Systolic Pressure

In [74]:
df = pd.read_csv(os.path.join(DATA_DIR, 'vital_signs_on_admission','BloodPressure_Death.csv'))
df.head()

Unnamed: 0,subject_id,chartdate,result_name,result_value,dod
0,10098672,2140-04-18,Blood Pressure,120/74,
1,10098672,2140-11-14,Blood Pressure,110/66,
2,10098672,2140-12-01,Blood Pressure,114/60,
3,10098672,2141-01-05,Blood Pressure,134/68,
4,10098672,2141-04-13,Blood Pressure,150/60,


In [75]:
df.subject_id.value_counts()

subject_id
12468016    4428
13158454    4425
10882916    3520
11965254    3268
16658776    2639
            ... 
19923690       1
19922993       1
12853724       1
11617211       1
14609551       1
Name: count, Length: 1744, dtype: int64

In [67]:
def handle_bp_string(bp_string: str) -> int:
    return int(bp_string.split('/')[1])

In [68]:
for i in range(df.shape[0]):
    df.iloc[i, 3] = handle_bp_string(df.iloc[i, 3])

df.head()

Unnamed: 0,subject_id,chartdate,result_name,result_value,dod
0,10098672,2140-04-18,Blood Pressure,74,
1,10098672,2140-11-14,Blood Pressure,66,
2,10098672,2140-12-01,Blood Pressure,60,
3,10098672,2141-01-05,Blood Pressure,68,
4,10098672,2141-04-13,Blood Pressure,60,


In [69]:
df.result_name.value_counts()

result_name
Blood Pressure    170792
Name: count, dtype: int64

## Blood Pressure - Diatolic Pressure

## Respiratory rate

In [76]:
df = pd.read_csv(os.path.join(DATA_DIR, 'vital_signs_on_admission', 'RespiratoryRate.csv'))
df.head()

Unnamed: 0,subject_id,itemid,label,value,valueuom,dod
0,10303503,220210,Respiratory Rate,13.0,insp/min,
1,10303503,220210,Respiratory Rate,12.0,insp/min,
2,10303503,220210,Respiratory Rate,12.0,insp/min,
3,10303503,220210,Respiratory Rate,15.0,insp/min,
4,10303503,220210,Respiratory Rate,18.0,insp/min,


In [77]:
df.subject_id.value_counts()

subject_id
12468016    90396
11204646    29062
13158454    12350
15219971    11304
10439781    10943
            ...  
11955308        9
12456798        9
13126529        9
12139354        7
18002691        4
Name: count, Length: 683, dtype: int64

## Temperature

In [83]:
df = pd.read_csv(os.path.join(DATA_DIR, 'vital_signs_on_admission', 'Temperature.csv'))
df.head()

Unnamed: 0,subject_id,itemid,label,value,valueuom,dod
0,10645926,223762,Temperature Celsius,38.2,°C,
1,10645926,223762,Temperature Celsius,38.2,°C,
2,10645926,223762,Temperature Celsius,37.5,°C,
3,10645926,223762,Temperature Celsius,37.3,°C,
4,10645926,223762,Temperature Celsius,37.5,°C,


In [81]:
df.subject_id.value_counts()

subject_id
14096194    1260
11204646    1111
13158454    1075
16310288     384
18468009     266
            ... 
17639084       2
10961036       2
16033463       2
11013939       2
16389191       1
Name: count, Length: 86, dtype: int64

## SpO2 Desat Limit

In [78]:
df = pd.read_csv(os.path.join(DATA_DIR, 'vital_signs_on_admission', 'SpO2DesatLimit.csv'))
df.head()

Unnamed: 0,subject_id,itemid,label,value,valueuom,dod
0,10303503,226253,SpO2 Desat Limit,88,%,
1,10303503,226253,SpO2 Desat Limit,88,%,
2,10303503,226253,SpO2 Desat Limit,85,%,
3,10303503,226253,SpO2 Desat Limit,88,%,
4,10303503,226253,SpO2 Desat Limit,88,%,


In [79]:
df.subject_id.value_counts()

subject_id
12468016    6768
11204646    2376
13158454    1075
10439781    1054
17340686     858
            ... 
18002691       1
11955308       1
17086205       1
13347660       1
14761038       1
Name: count, Length: 680, dtype: int64

# First Laboratory Tests after Admission

> subject_id: 12468016 非常奇怪的数据

In [21]:
df = pd.read_csv(os.path.join(DATA_DIR, 'Labevent','Labevents_Death.csv'))
df = df[df.subject_id != 12468016] # remove outlier
df.head()

  df = pd.read_csv(os.path.join(DATA_DIR, 'Labevent','Labevents_Death.csv'))


Unnamed: 0,subject_id,itemid,label,value,valuenum,valueuom,dod
0,10098672,51250,MCV,88.0,88.0,fL,
1,10098672,51200,Eosinophils,1.5,1.5,%,
2,10098672,51250,MCV,87.0,87.0,fL,
3,10098672,51200,Eosinophils,0.3,0.3,%,
4,10098672,51250,MCV,83.0,83.0,fL,


> 寻找哪些 label 是所有 subject_id 都发生的

In [22]:
tmp = df.groupby(['subject_id', 'label']).agg({
    'subject_id': 'count'
}).index.to_list()

res = pd.Series([_[1] for _ in tmp]).value_counts()

In [23]:
res[res > 2400]

White Blood Cells                2412
Hematocrit                       2412
Red Blood Cells                  2412
RDW                              2412
Platelet Count                   2412
MCV                              2412
MCHC                             2412
MCH                              2412
Hemoglobin                       2412
Glucose                          2409
Creatinine                       2407
Estimated GFR (MDRD equation)    2407
Urea Nitrogen                    2406
Sodium                           2402
Potassium                        2402
Chloride                         2402
Name: count, dtype: int64

> 计算每个 label 的均值与标准差

In [24]:
for _label in res[res > 2400].index.tolist():
    print('*'*100)
    print('label: {}'.format(_label))
    print('*'*100)
    for i ,_mode in enumerate([patients_list, both_ibd_patients, only_cd_patients, only_uc_patients]):
        filter_condition = (df.subject_id.isin(_mode)) \
                                & (df.label == _label)
        if i == 0:
            print(df[filter_condition].valueuom.value_counts())
            print('-'*100)
        print('Group: {} Mean: {:.4f} Std: {:.4f}'.format(i, df[filter_condition].valuenum.describe().values[1], df[filter_condition].valuenum.describe().values[2]))

    print('*'*100)
        
        
    # break

****************************************************************************************************
label: White Blood Cells
****************************************************************************************************
valueuom
K/uL    370415
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
Group: 0 Mean: 8.6156 Std: 4.9484
Group: 1 Mean: 8.7808 Std: 4.7245
Group: 2 Mean: 8.5463 Std: 4.5528
Group: 3 Mean: 8.6881 Std: 5.6996
****************************************************************************************************
****************************************************************************************************
label: Hematocrit
****************************************************************************************************
valueuom
%    389042
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
Group: 0 Mean: 32

# Treatment

In [84]:
df = pd.read_csv(os.path.join(DATA_DIR, 'treatment', 'input_events.csv'))
df.head()

Unnamed: 0,subject_id,starttime,endtime,itemid,label,totalamount,totalamountuom,dod
0,10098672,,,,,,,
1,10303503,2155-11-08 08:09:00,2155-11-08 10:09:00,222011.0,Magnesium Sulfate,100.0,ml,
2,10303503,2155-11-08 08:09:00,2155-11-08 10:09:00,225158.0,NaCl 0.9%,100.0,ml,
3,10303503,2155-11-08 10:05:00,2155-11-08 10:06:00,221833.0,Hydromorphone (Dilaudid),,,
4,10303503,2155-11-08 03:55:00,2155-11-08 03:56:00,226361.0,Pre-Admission/Non-ICU Intake,,ml,


In [88]:
_ = df.groupby(['subject_id', 'itemid']).agg({
    'label': 'first'
}).index.values

_ = pd.Series([x[1] for x in _]).value_counts()

_[_ > 200]

225158.0    635
220949.0    587
226452.0    537
225943.0    460
222011.0    447
225166.0    394
225975.0    373
225798.0    364
225828.0    357
227523.0    339
225910.0    310
221456.0    307
226089.0    297
225168.0    280
223258.0    263
226361.0    262
227522.0    261
222168.0    260
221744.0    254
221794.0    243
225799.0    229
225884.0    227
225154.0    218
226364.0    217
221906.0    209
221833.0    207
Name: count, dtype: int64

# Outcomes

## Mortality

In [25]:
df_ibd = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'))
df_ibd.head(10)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,subject_id.1,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10098672,21229395,1,9975,9,10098672,M,61,2140,2011 - 2013,
1,10098672,21229395,2,5990,9,10098672,M,61,2140,2011 - 2013,
2,10098672,21229395,3,5849,9,10098672,M,61,2140,2011 - 2013,
3,10098672,21229395,4,5559,9,10098672,M,61,2140,2011 - 2013,
4,10098672,21229395,5,5793,9,10098672,M,61,2140,2011 - 2013,
5,10098672,21229395,6,E8788,9,10098672,M,61,2140,2011 - 2013,
6,10098672,21229395,7,0413,9,10098672,M,61,2140,2011 - 2013,
7,10098672,21229395,8,0416,9,10098672,M,61,2140,2011 - 2013,
8,10098672,21229395,9,5853,9,10098672,M,61,2140,2011 - 2013,
9,10098672,21229395,10,2809,9,10098672,M,61,2140,2011 - 2013,


In [26]:
stats = df_ibd.groupby('subject_id').agg({
    'dod': 'first'
})

stats.head()

Unnamed: 0_level_0,dod
subject_id,Unnamed: 1_level_1
10001186,
10007174,
10018852,
10024331,2145-01-23
10025647,2181-06-16


In [27]:
for i ,_patients_list in enumerate([patients_list, both_ibd_patients, only_cd_patients, only_uc_patients]):
    stats_tmp = stats[stats.index.isin(_patients_list)]
    print('Group {}'.format(i))
    print('  Total patient nums: {}'.format(stats_tmp.shape[0]))
    print('  *Dead patient nums: {} ({:.2f}%)'.format(stats_tmp[stats_tmp['dod'].notnull()].shape[0], (stats_tmp[stats_tmp['dod'].notnull()].shape[0] / stats_tmp.shape[0]*100)))
    print('  Alive patient nums: {} ({:.2f}%)'.format(stats_tmp[stats_tmp['dod'].isnull()].shape[0], ((stats_tmp[stats_tmp['dod'].isnull()].shape[0] / stats_tmp.shape[0]*100))))
    # break

Group 0
  Total patient nums: 2417
  *Dead patient nums: 380 (15.72%)
  Alive patient nums: 2037 (84.28%)
Group 1
  Total patient nums: 85
  *Dead patient nums: 14 (16.47%)
  Alive patient nums: 71 (83.53%)
Group 2
  Total patient nums: 1280
  *Dead patient nums: 167 (13.05%)
  Alive patient nums: 1113 (86.95%)
Group 3
  Total patient nums: 1052
  *Dead patient nums: 199 (18.92%)
  Alive patient nums: 853 (81.08%)


## LOS on Admission

In [28]:
df = pd.read_csv(os.path.join(DATA_DIR, 'outcome', 'admission_stay.csv'))
for _column in ['admittime', 'dischtime', 'edregtime', 'edouttime']:
    # 入院 出院 进入Ed 离开Ed
    df[_column] = pd.to_datetime(df[_column])
    
df['los_ad'] = df['dischtime'] - df['admittime']
df['los_dp'] = df['edouttime'] - df['edregtime']

df.head()

Unnamed: 0,subject_id,hadm_id,admission_type,anchor_year,admittime,dischtime,edregtime,edouttime,deathtime,los_ad,los_dp
0,10098672,21229395.0,EW EMER.,2140,2142-05-16 04:04:00,2142-05-23 14:15:00,2142-05-15 15:49:00,2142-05-16 05:45:00,,7 days 10:11:00,0 days 13:56:00
1,10098672,21259834.0,EW EMER.,2140,2141-04-13 17:30:00,2141-04-17 19:17:00,2141-04-13 12:17:00,2141-04-13 18:47:00,,4 days 01:47:00,0 days 06:30:00
2,10098672,21921527.0,URGENT,2140,2141-06-29 22:55:00,2141-07-03 16:52:00,NaT,NaT,,3 days 17:57:00,NaT
3,10098672,24843492.0,SURGICAL SAME DAY ADMISSION,2140,2142-05-06 07:15:00,2142-05-11 13:39:00,NaT,NaT,,5 days 06:24:00,NaT
4,10098672,26570321.0,EW EMER.,2140,2142-01-20 23:24:00,2142-01-28 18:47:00,2142-01-20 19:54:00,2142-01-21 01:07:00,,7 days 19:23:00,0 days 05:13:00


In [29]:
stats = df.groupby('subject_id').agg({
    'los_ad': ['mean', 'min', 'max', 'count', 'first'],
    'los_dp': ['mean', 'min', 'max', 'count', 'first']
})

stats.head()

Unnamed: 0_level_0,los_ad,los_ad,los_ad,los_ad,los_ad,los_dp,los_dp,los_dp,los_dp,los_dp
Unnamed: 0_level_1,mean,min,max,count,first,mean,min,max,count,first
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
10001186,1 days 20:00:40,1 days 07:50:00,2 days 05:05:00,9,2 days 05:05:00,0 days 06:11:00,0 days 06:11:00,0 days 06:11:00,3,0 days 06:11:00
10007174,1 days 10:08:40,0 days 10:19:00,2 days 00:09:00,3,1 days 19:58:00,0 days 13:17:20,0 days 06:03:00,0 days 22:05:00,3,0 days 11:44:00
10018852,4 days 05:27:00,1 days 22:21:00,8 days 08:00:00,3,1 days 22:21:00,0 days 10:07:00,0 days 10:07:00,0 days 10:07:00,1,0 days 10:07:00
10024331,3 days 11:38:47.142857142,0 days 20:02:00,9 days 05:13:00,28,1 days 11:35:00,0 days 06:19:15,0 days 02:15:00,0 days 11:38:00,24,0 days 03:45:00
10025647,3 days 17:28:45,0 days 19:03:00,9 days 21:08:00,16,3 days 02:18:00,0 days 08:29:42.857142857,0 days 03:20:00,1 days 08:14:00,14,0 days 04:44:00


In [30]:
for i ,_patients_list in enumerate([patients_list, both_ibd_patients, only_cd_patients, only_uc_patients]):
    stats_tmp = stats[stats.index.isin(_patients_list)]
    print('Group {}'.format(i))
    print('  Mean LOS on Admission: {} (+-{})'.format(stats_tmp['los_ad']['first'].mean(), stats_tmp['los_ad']['first'].std()))
    # break

Group 0
  Mean LOS on Admission: 4 days 19:25:22.120051085 (+-5 days 23:17:51.516531409)
Group 1
  Mean LOS on Admission: 4 days 18:49:58.588235294 (+-5 days 01:47:57.468224811)
Group 2
  Mean LOS on Admission: 4 days 14:43:11.932367149 (+-6 days 05:57:37.910147999)
Group 3
  Mean LOS on Admission: 5 days 01:11:13.385518591 (+-5 days 16:24:42.168597360)


## LOS on DP

In [31]:
for i ,_patients_list in enumerate([patients_list, both_ibd_patients, only_cd_patients, only_uc_patients]):
    stats_tmp = stats[stats.index.isin(_patients_list)]
    print('Group {}'.format(i))
    print('  Mean LOS on DP: {} (+-{})'.format(stats_tmp['los_dp']['first'].mean(), stats_tmp['los_dp']['first'].std()))
    # break

Group 0
  Mean LOS on DP: 0 days 08:07:56.766121270 (+-0 days 06:27:33.496867075)
Group 1
  Mean LOS on DP: 0 days 07:06:40.481927710 (+-0 days 03:46:12.957682565)
Group 2
  Mean LOS on DP: 0 days 08:34:10.575342465 (+-0 days 07:34:29.950373287)
Group 3
  Mean LOS on DP: 0 days 07:41:41 (+-0 days 04:58:29.599550584)
