In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append('./utils')
from env import load_env_file
from stats import plot_distribution
from tqdm import tqdm

DATA_DIR = load_env_file()

load env file
  root dir:
    c:\Users\sitdo\Documents\GitHub\IBD-EDA
  current system:
    Windows
  loaded env file:
    .env.windows
  loaded data dir:
    C:\Users\k\Nutstore\1\毕设-EHR\Db
done.


In [2]:
'''读取患者列表
    p_lists: dict
    - both_ibd
    - only_uc
    - only_cd
'''
with open(os.path.join(DATA_DIR, 'data', 'patients_lists.json'), 'r') as f:
    p_lists = json.loads(f.read())

print('total patients num:\n  {}'.format(len(p_lists['both_ibd'] + p_lists['only_uc'] + p_lists['only_cd'])))
print('both uc and cd:\n  {}'.format(len(p_lists['both_ibd'])))
print('only uc:\n  {}'.format(len(p_lists['only_uc'])))
print('only cd:\n  {}'.format(len(p_lists['only_cd'])))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\k\\Nutstore\\1\\毕设-EHR\\Db\\data\\patients_lists.json'

In [6]:
'''IBD 病患的人口学数据
    rows
    - subject_id
    columns
    - gender
    - anchor_age
    - anchor_year_group
    - dod
'''
df = pd.read_csv(os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv'))
df_demography = df.groupby('subject_id').agg({
    'gender': 'first',
    'anchor_age': 'first',
    'anchor_year_group': 'first',
    'dod': 'first',
})

# df_demography = df_demography.loc[p_lists['only_cd'], :]
df_demography.shape

(2417, 4)

In [7]:
'''同一种 icd code 有多少 IBD 病患的诊断记录
    row
    - icd_code
    - count
'''
stats = df.groupby(['subject_id', 'icd_code']).agg({
    'icd_code': ['count'],
})
stats.columns = ['count']
stats = stats.reset_index()

icd_code_counts = stats.groupby('icd_code').agg({
    'subject_id': ['nunique']
})
icd_code_counts.columns = ['count']
icd_code_counts.head()

Unnamed: 0_level_0,count
icd_code,Unnamed: 1_level_1
29,1
30,4
31,2
38,1
39,1


In [8]:
'''获取所有 count 超过阈值的 icd code list
'''
def get_suspect_icd_code(threshold=300) -> list:
    ibd_icd_codes = ['5550', '5551', '5552', '5559', '5560', '5561', '5562', '5563', '5564', '5565', '5566', '5568', '5569', 'V4986']
    suspect_icd_code = [_ for _ in icd_code_counts[icd_code_counts['count'] > threshold].index.tolist() if _ not in ibd_icd_codes]
    # suspect_icd_code = ["2639", "2761", "2762", "27800", "2875", "3051", "311", "32723", "412", "4280", "51881", "56089", "56722", "5849", "5990", "99592", "99859", "D649", "F419", "N179", "V1582", "25000"]
    return suspect_icd_code

len(get_suspect_icd_code(0))

6532

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,subject_id.1,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10098672,21229395,1,9975,9,10098672,M,61,2140,2011 - 2013,
1,10098672,21229395,2,5990,9,10098672,M,61,2140,2011 - 2013,
2,10098672,21229395,3,5849,9,10098672,M,61,2140,2011 - 2013,
3,10098672,21229395,4,5559,9,10098672,M,61,2140,2011 - 2013,
4,10098672,21229395,5,5793,9,10098672,M,61,2140,2011 - 2013,
...,...,...,...,...,...,...,...,...,...,...,...
122135,14458834,29475856,26,V5864,9,14458834,F,86,2161,2011 - 2013,2162-03-15
122136,14458834,29475856,27,V5866,9,14458834,F,86,2161,2011 - 2013,2162-03-15
122137,14458834,29475856,28,V1254,9,14458834,F,86,2161,2011 - 2013,2162-03-15
122138,14458834,29475856,29,V1588,9,14458834,F,86,2161,2011 - 2013,2162-03-15


分割线 之前改用更先进的算法
---

In [9]:
'''X (2417, 2 + len(suspect_icd_code))

    从 df 中根据 suspect icd code 提取
    X > rows
    - subject_id
    X > columns
    - gender 
    - age
    - icd code A
    - icd code B
    - icd code C
    - ...
'''
suspect_icd_code = get_suspect_icd_code(120)
patients_subject_id = df_demography.index.to_list()

X = np.zeros((len(patients_subject_id), 3 + len(suspect_icd_code)))
for i in tqdm(range(len(patients_subject_id))):
    subject_id = patients_subject_id[i]
    filtered_df = df[df['subject_id'] == subject_id]
    
    X[i, 0] = 1 if df_demography.loc[subject_id, 'gender'] == 'M' else 0
    # X[i, 1] = 1 if int(df_demography.loc[subject_id, 'anchor_age']) >= 60 else 0
    age_pattern = [(0, 0), (0, 1), (1, 0)]
    if int(df_demography.loc[subject_id, 'anchor_age']) <= 35:
        X[i, 1] = age_pattern[0][0]
        X[i, 2] = age_pattern[0][1]
    elif int(df_demography.loc[subject_id, 'anchor_age']) <= 55:
        X[i, 1] = age_pattern[1][0]
        X[i, 2] = age_pattern[1][1]
    else:
        X[i, 1] = age_pattern[2][0]
        X[i, 2] = age_pattern[2][1]
    
    
    for j in range(len(suspect_icd_code)):
        X[i, 3 + j] = 1 if (filtered_df['icd_code'] == suspect_icd_code[j]).any() else 0

X.shape

100%|██████████| 2417/2417 [00:08<00:00, 294.73it/s]


(2417, 102)

In [8]:
'''y (2417,)
'''
y = np.array(pd.notnull(df_demography.dod).astype(int))
print('  dead nums: {} live nums: {}'.format(y[y == 1].shape, y[y == 0].shape))

  dead nums: (380,) live nums: (2037,)


In [10]:
'''保存 X 和 y 至 R 项目目录
'''
np.savetxt('../r scripts/data/X__.csv', 
           X, 
           delimiter=',', 
           header=','.join(['gender', 'age0', 'age1'] + [str(_) for _ in suspect_icd_code]), 
           comments='')
np.savetxt('../r scripts/data/y__.csv', 
           y, 
           delimiter=',', 
           header='dod',
           comments='')