In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
# Extend the module search path so the local helper modules imported below can
# be resolved (presumably env.py and stats.py live in ./utils -- confirm).
# The entry is a relative path, so it depends on the kernel's working directory.
sys.path.append('./utils')
from env import load_env_file
from stats import plot_distribution
from tqdm import tqdm

# Base data directory returned by load_env_file(); the input files read in the
# following cells are located by joining sub-paths onto it.
DATA_DIR = load_env_file()

load env file
  root dir:
    /Users/k/Repo/gp-ibd
  current system:
    Darwin
  loaded env file:
    .env.darwin
  loaded data dir:
    /Users/k/Nutstore Files/毕设-EHR/DB
done.


In [2]:
'''Load the patient lists.
    p_lists: dict with the keys
    - both_ibd
    - only_uc
    - only_cd
'''
patients_lists_path = os.path.join(DATA_DIR, 'data', 'patients_lists.json')
with open(patients_lists_path, 'r') as f:
    p_lists = json.load(f)

# Size of each group; the total is the sum of the three lists.
n_both_ibd = len(p_lists['both_ibd'])
n_only_uc = len(p_lists['only_uc'])
n_only_cd = len(p_lists['only_cd'])

print('total patients num:\n  {}'.format(n_both_ibd + n_only_uc + n_only_cd))
print('both uc and cd:\n  {}'.format(n_both_ibd))
print('only uc:\n  {}'.format(n_only_uc))
print('only cd:\n  {}'.format(n_only_cd))

total patients num:
  2417
both uc and cd:
  85
only uc:
  1052
only cd:
  1280


In [None]:
'''Demographic data of the patients
    rows
    - subject_id
    columns
    - gender
    - anchor_age
    - anchor_year_group
    - dod

    NOTE(review): only the patients of the 'only_cd' list are kept below,
    not all three patient lists -- confirm this restriction is intended.
'''
complications_csv = os.path.join(DATA_DIR, 'complication', 'Complications_Patients.csv')
df = pd.read_csv(complications_csv)

# Collapse the records to one row per subject_id using the 'first' aggregation
# for every demographic column.
demography_columns = ['gender', 'anchor_age', 'anchor_year_group', 'dod']
df_demography = (
    df.groupby('subject_id')
      .agg({column: 'first' for column in demography_columns})
)

# Select the rows of the 'only_cd' patients by subject_id label.
df_demography = df_demography.loc[p_lists['only_cd'], :]
df_demography.shape

In [None]:
'''For each icd code, the number of patients that have a diagnosis record with it
    row
    - icd_code
    - count
'''
# Number of records per (subject_id, icd_code) pair, flattened to plain columns.
stats = (
    df.groupby(['subject_id', 'icd_code'])
      .agg(count=('icd_code', 'count'))
      .reset_index()
)

# Number of distinct patients per icd code.
icd_code_counts = (
    stats.groupby('icd_code')
         .agg(count=('subject_id', 'nunique'))
)
icd_code_counts.head()

In [None]:
def get_suspect_icd_code(threshold=300, counts=None) -> list:
    '''Return every non-IBD icd code whose count exceeds the threshold.

    Parameters
        threshold: a code is kept only when its 'count' value is strictly
            greater than this number.
        counts: optional DataFrame indexed by icd code with a 'count' column.
            When omitted, the notebook-level icd_code_counts table is used.

    Returns
        list of icd codes, in the index order of the counts table, without
        the codes listed in ibd_icd_codes.
    '''
    # Codes the original author labelled as IBD codes; they are removed from
    # the result. NOTE(review): only these exact strings are excluded --
    # confirm no other IBD codes (e.g. other code versions) occur in the data.
    ibd_icd_codes = frozenset([
        '5550', '5551', '5552', '5559',
        '5560', '5561', '5562', '5563', '5564', '5565', '5566', '5568', '5569',
    ])

    # Fall back to the global table so existing calls keep working unchanged.
    if counts is None:
        counts = icd_code_counts

    frequent_codes = counts[counts['count'] > threshold].index.tolist()
    return [code for code in frequent_codes if code not in ibd_icd_codes]

In [None]:
'''X (len(df_demography), 2 + len(suspect_icd_code))

    Feature matrix extracted from df according to the suspect icd codes.
    X > rows
    - one row per subject_id of df_demography, in the same order
    X > columns
    - gender      (1 if 'M', otherwise 0)
    - age         (1 if anchor_age >= 60, otherwise 0)
    - icd code A  (1 if the patient has at least one record with the code)
    - icd code B
    - icd code C
    - ...
'''
suspect_icd_code = get_suspect_icd_code(120)
patients_subject_id = df_demography.index.to_list()

X = np.zeros((len(patients_subject_id), 2 + len(suspect_icd_code)))

# Demographic flags, computed for all patients at once.
X[:, 0] = (df_demography['gender'] == 'M').to_numpy()
X[:, 1] = (df_demography['anchor_age'].astype(int) >= 60).to_numpy()

# Build the patient x code presence table in a single pass over df instead of
# re-filtering the whole frame for every patient and every code.
diagnosed = (
    df.loc[df['icd_code'].isin(suspect_icd_code), ['subject_id', 'icd_code']]
      .drop_duplicates()  # one row per (patient, code) so the table holds 0/1
)
has_code = (
    pd.crosstab(diagnosed['subject_id'], diagnosed['icd_code'])
      # Align rows/columns with X; patients or codes without a record get 0.
      .reindex(index=patients_subject_id, columns=suspect_icd_code, fill_value=0)
)
X[:, 2:] = has_code.to_numpy()

X.shape

In [None]:
'''y (len(df_demography),)

    Label per patient: 1 when dod is not null, otherwise 0.
'''
has_dod = df_demography['dod'].notna()
y = np.array(has_dod.astype(int))

# The shapes (1-tuples) of the two label subsets are printed, not plain counts.
dead_shape = y[y == 1].shape
live_shape = y[y == 0].shape
print('  dead nums: {} live nums: {}'.format(dead_shape, live_shape))

In [None]:
'''Save X and y into the R project directory
'''
# Header of X: the two demographic flags followed by one column per icd code.
x_header = ','.join(['gender', 'age'] + list(map(str, suspect_icd_code)))

# comments='' keeps numpy from prefixing the header line with '# '.
np.savetxt('../r scripts/data/X.csv', X, delimiter=',', header=x_header, comments='')
np.savetxt('../r scripts/data/y.csv', y, delimiter=',', header='dod', comments='')