In [18]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from utils import load_env_file, set_mpl_configs
from utils import leave_percentile, distribution_analysis

# Run the project helpers from the local `utils` module first: judging by their
# log output they load the platform-specific env file and set the matplotlib
# font configuration. The env file must be loaded before DATA_DIR is read.
load_env_file()
set_mpl_configs()

# Dataset root directory taken from the environment (None when the variable is unset).
DATA_DIR = os.environ.get('DATA_DIR')
print('DATA_DIR: {}'.format(DATA_DIR))

load env file
  root dir:
    /Users/k/Repo/gp-ibd
  current system:
    Darwin
  load .env.darwin
  loaded data dir:
    /Users/k/Nutstore Files/毕设-EHR/DB
done.
set matplotlib configs
  font family:
    ['Times New Roman']
done.
DATA_DIR: /Users/k/Nutstore Files/毕设-EHR/DB


In [76]:
# Load the diagnosis rows (one row per subject/admission/sequence number) together
# with the patient demographic columns stored in the same CSV.
# `icd_code` is pinned to `str`: ICD codes are identifiers, not numbers, and some
# carry leading zeros (e.g. '0413') that numeric type inference would strip. The
# string-based matching applied to this column later also requires `str` values.
df_ibd = pd.read_csv(
    os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'),
    dtype={'icd_code': str},
)
df_ibd.head(10)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,subject_id.1,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10098672,21229395,1,9975,9,10098672,M,61,2140,2011 - 2013,
1,10098672,21229395,2,5990,9,10098672,M,61,2140,2011 - 2013,
2,10098672,21229395,3,5849,9,10098672,M,61,2140,2011 - 2013,
3,10098672,21229395,4,5559,9,10098672,M,61,2140,2011 - 2013,
4,10098672,21229395,5,5793,9,10098672,M,61,2140,2011 - 2013,
5,10098672,21229395,6,E8788,9,10098672,M,61,2140,2011 - 2013,
6,10098672,21229395,7,0413,9,10098672,M,61,2140,2011 - 2013,
7,10098672,21229395,8,0416,9,10098672,M,61,2140,2011 - 2013,
8,10098672,21229395,9,5853,9,10098672,M,61,2140,2011 - 2013,
9,10098672,21229395,10,2809,9,10098672,M,61,2140,2011 - 2013,


In [77]:
def is_ibd(some_icd_codes: [str]) -> bool:
    """Tell whether any code is a 4-character code starting with '555' or '556'.

    Args:
        some_icd_codes: Iterable of ICD code strings (e.g. all codes of one patient).

    Returns:
        True if at least one code matches, otherwise False (also for empty input).
    """
    ibd_prefixes = ('555', '556')
    return any(
        code.startswith(ibd_prefixes) and len(code) == 4
        for code in some_icd_codes
    )


def is_cd(some_icd_codes: [str]) -> bool:
    """Tell whether any code is one of '5550', '5551', '5552' or '5559'.

    Args:
        some_icd_codes: Iterable of ICD code strings (e.g. all codes of one patient).

    Returns:
        True if at least one code is in the list, otherwise False
        (also for empty input).
    """
    cd_codes = ['5550', '5551', '5552', '5559']
    return any(code in cd_codes for code in some_icd_codes)

def is_uc(some_icd_codes: [str]) -> bool:
    """Tell whether any code is one of the listed '556x' codes.

    The accepted codes are '5560' through '5566', '5568' and '5569'.

    Args:
        some_icd_codes: Iterable of ICD code strings (e.g. all codes of one patient).

    Returns:
        True if at least one code is in the list, otherwise False
        (also for empty input).
    """
    uc_codes = [
        '5560', '5561', '5562', '5563', '5564',
        '5565', '5566', '5568', '5569',
    ]
    return any(code in uc_codes for code in some_icd_codes)

In [101]:
# Collapse the diagnosis rows into one row per patient: keep the first gender and
# anchor age seen for the patient, and derive three boolean flags from the
# patient's ICD codes. Each (name, function) pair becomes a sub-column under
# 'icd_code', so the result has two-level column labels.
stats = df_ibd.groupby('subject_id').agg({
    'gender': 'first',
    'anchor_age': 'first',
    'icd_code': [('ibd', is_ibd), ('cd', is_cd), ('uc', is_uc)],
})
print(stats.shape)
stats.head()

(2417, 5)


Unnamed: 0_level_0,gender,anchor_age,icd_code,icd_code,icd_code
Unnamed: 0_level_1,first,first,ibd,cd,uc
subject_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10001186,F,46,True,True,False
10007174,M,70,True,True,False
10018852,M,19,True,False,True
10024331,M,72,True,False,True
10025647,M,83,True,False,True


In [102]:
# Per-patient boolean masks taken from the aggregated flag columns of `stats`.
has_uc = stats[('icd_code', 'uc')].astype(bool)
has_cd = stats[('icd_code', 'cd')].astype(bool)

# Patients with only one type of IBD (i.e. not flagged for both UC and CD).
one_ibd_stats = stats[~(has_uc & has_cd)]
# Patients with both types of IBD.
both_ibd_stats = stats[has_uc & has_cd]
# Patients with UC (regardless of CD).
uc_ibd_stats = stats[has_uc]
# Patients with CD (regardless of UC). This used to be assigned to
# `uc_ibd_stats`, which silently overwrote the UC subset defined just above.
cd_ibd_stats = stats[has_cd]
# Patients with UC only.
only_uc_ibd_stats = stats[has_uc & ~has_cd]
# Patients with CD only.
only_cd_ibd_stats = stats[~has_uc & has_cd]


In [82]:
# Sizes of the three groups; the printed total can be compared with the number
# of rows in `stats`.
n_both, n_only_uc, n_only_cd = (
    len(group) for group in (both_ibd_stats, only_uc_ibd_stats, only_cd_ibd_stats)
)
print(n_both + n_only_uc + n_only_cd)
print(n_both, n_only_uc, n_only_cd)

2417
85 1052 1280


In [83]:
# Scratch display: the codes matched by is_cd followed by the codes matched by is_uc.
[
    '5550', '5551', '5552', '5559',
    '5560', '5561', '5562', '5563', '5564', '5565', '5566', '5568', '5569',
]

['5550',
 '5551',
 '5552',
 '5559',
 '5560',
 '5561',
 '5562',
 '5563',
 '5564',
 '5565',
 '5566',
 '5568',
 '5569']

In [87]:
# Collect the subject ids (the index of each frame) of the three groups.
data = {
    'both_ibd': both_ibd_stats.index.tolist(),
    'only_uc': only_uc_ibd_stats.index.tolist(),
    'only_cd': only_cd_ibd_stats.index.tolist(),
}

# Write the groups as JSON. `json` is imported in the notebook's top import cell;
# the encoding is given explicitly so the file does not depend on the platform default.
with open('../data/ibd_demo.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)

In [96]:
# Display the raw diagnosis table; the rendered output is truncated to its first and last rows.
df_ibd

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,subject_id.1,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10098672,21229395,1,9975,9,10098672,M,61,2140,2011 - 2013,
1,10098672,21229395,2,5990,9,10098672,M,61,2140,2011 - 2013,
2,10098672,21229395,3,5849,9,10098672,M,61,2140,2011 - 2013,
3,10098672,21229395,4,5559,9,10098672,M,61,2140,2011 - 2013,
4,10098672,21229395,5,5793,9,10098672,M,61,2140,2011 - 2013,
...,...,...,...,...,...,...,...,...,...,...,...
122135,14458834,29475856,26,V5864,9,14458834,F,86,2161,2011 - 2013,2162-03-15
122136,14458834,29475856,27,V5866,9,14458834,F,86,2161,2011 - 2013,2162-03-15
122137,14458834,29475856,28,V1254,9,14458834,F,86,2161,2011 - 2013,2162-03-15
122138,14458834,29475856,29,V1588,9,14458834,F,86,2161,2011 - 2013,2162-03-15


In [110]:
# For the full cohort and each of the three groups, print: number of patients,
# male and female counts, the male proportion, and the mean/std of anchor age.
for _df in (stats, both_ibd_stats, only_cd_ibd_stats, only_uc_ibd_stats):
    print('-' * 100)
    total = _df.shape[0]
    print(total)

    gender = _df['gender']['first']
    n_male = int(gender.eq('M').sum())
    n_female = int(gender.eq('F').sum())
    print(n_male, n_female)
    print(n_male / total)

    # describe() returns count, mean, std, ...; positions 1 and 2 are mean and std.
    print(_df['anchor_age']['first'].describe().iloc[1:3])

----------------------------------------------------------------------------------------------------
2417
1075 1342
0.44476623913942903
mean    51.374845
std     18.552998
Name: first, dtype: float64
----------------------------------------------------------------------------------------------------
85
40 45
0.47058823529411764
mean    46.976471
std     19.190137
Name: first, dtype: float64
----------------------------------------------------------------------------------------------------
1280
541 739
0.42265625
mean    50.271094
std     17.989462
Name: first, dtype: float64
----------------------------------------------------------------------------------------------------
1052
494 558
0.4695817490494297
mean    53.073194
std     19.028182
Name: first, dtype: float64
