In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 读取数据

In [2]:
# Load the ICU/IBD table and convert its `intime` column to datetime.
df = pd.read_csv('./data/icu_ibd_all_table.csv').assign(
    intime=lambda frame: pd.to_datetime(frame['intime'])
)
df.shape

(1161, 32)

In [3]:
# Load the patient table and turn every anchor year into a January-1st
# timestamp, so it can take part in datetime arithmetic.
df2 = pd.read_csv('./data/patients_ibd.csv')


def func(x):
    """Return the timestamp of January 1st of year ``x``."""
    return pd.to_datetime(f'{x}-01-01')


df2['anchor_year'] = df2['anchor_year'].apply(func)

df2.shape

(2417, 6)

In [4]:
# Attach each patient's anchor-year timestamp to the ICU rows (left join on
# `subject_id`) and order the rows by patient, then by `intime`.
data = (
    df.merge(df2[['subject_id', 'anchor_year']], on='subject_id', how='left')
      .sort_values(by=['subject_id', 'intime'])
)

# Shift `age` by the time elapsed between the anchor year and `intime`,
# measured in whole days and divided by 365.
years_since_anchor = (data['intime'] - data['anchor_year']).dt.days / 365
data['age'] = years_since_anchor + data['age']

In [5]:
# Check for missing values: collect and report every column in which more
# than 5% of the rows are missing.
cols_missing = []

n_rows = data.shape[0]
for col in data.columns:
    na_count = data[col].isna().sum()
    na_pct = na_count / n_rows*100
    if na_pct > 5:
        cols_missing.append(col)
        print(f'{col}:\n  NA count: {na_count} ({na_pct:.2f}%)\n')

weight:
  NA count: 1066 (91.82%)

bmi:
  NA count: 1070 (92.16%)

systolic_pressure:
  NA count: 1099 (94.66%)

diastolic_pressure:
  NA count: 1099 (94.66%)

temperature:
  NA count: 962 (82.86%)

white_blood_cell:
  NA count: 1155 (99.48%)

red_blood_cell:
  NA count: 705 (60.72%)

CRP:
  NA count: 1161 (100.00%)



# 数据预处理

In [6]:
# Drop the columns that are not needed: the sparse columns collected in
# `cols_missing` plus the ones listed in `cols_useless`.
cols_useless = ['hadm_id', 'intime', 'outtime', 'mortality', 'die_in_icu', 'anchor_year']

cols_except = [*cols_missing, *cols_useless]

data = data.drop(columns=cols_except)

In [7]:
# Print the column index of `data` as it stands at this point.
print(data.columns)

Index(['subject_id', 'los', 'gender', 'age', 'heart_rate', 'respiratory_rate',
       'hematocrit', 'rdw', 'platelet', 'mcv', 'mch', 'hemoglobin', 'race',
       'language', 'marital_status', 'insurance', 'uc_only', 'cd_only',
       'uc_cd'],
      dtype='object')


## 过滤缺失行、去重并格式化 race

In [8]:
# Keep only the rows whose number of missing fields does not exceed 5% of
# (column count + 1).
max_missing = (data.shape[1]+1)*0.05
row_is_complete_enough = data.isna().sum(axis=1) <= max_missing

# Keep only the first remaining ICU record of every patient; rows stay in
# their current order, so "first" means first in that order.
data = data.loc[row_is_complete_enough].drop_duplicates(subset='subject_id', keep='first')

In [9]:
def parse_race(race):
    """Collapse a detailed race description into one of five coarse groups.

    Parameters
    ----------
    race : str or any
        Raw race description. Matching is done by case-sensitive substring.
        Non-string values (for example ``None`` or a float ``NaN`` coming
        from a missing cell) are treated as unknown.

    Returns
    -------
    str
        ``'WHITE'``, ``'BLACK'``, ``'HISPANIC/LATINO'``, ``'ASIAN'`` or
        ``'OTHER'``. The groups are tested in that order and the first
        match wins; anything unmatched is ``'OTHER'``.
    """
    # A missing value is not a string, and `'WHITE' in <float>` raises
    # TypeError, so classify every non-string as OTHER up front.
    if not isinstance(race, str):
        return 'OTHER'

    if 'WHITE' in race:
        return 'WHITE'
    if 'BLACK' in race:
        return 'BLACK'
    if 'HISPANIC' in race or 'LATINO' in race:
        return 'HISPANIC/LATINO'
    if 'ASIAN' in race:
        return 'ASIAN'
    return 'OTHER'


# Replace every raw race value with its coarse group.
data['race'] = data['race'].apply(parse_race)

In [10]:
# Remove the `subject_id` and `uc_cd` columns.
data = data.drop(columns=['subject_id', 'uc_cd'])

In [11]:
# Handle dummy variables.

# First print, for every categorical column, how often each category occurs
# (count and percentage of the non-missing values).
for col in ['language', 'race', 'marital_status', 'insurance']:
    freq = data[col].value_counts()
    summary = pd.DataFrame(
        {'index': freq.index, 'count': freq.values, 'percentage': freq.values/freq.sum()*100}
    ).set_index('index')
    print(summary)
    print()

# One-hot encode the categorical columns; `drop_first=True` omits the first
# level of each column.
categorical_cols = ['race', 'language', 'marital_status', 'insurance']
data = pd.get_dummies(
    data=data,
    columns=categorical_cols,
    prefix=categorical_cols,
    drop_first=True,
)

data.columns

         count  percentage
index                     
ENGLISH    623   95.552147
?           29    4.447853

                 count  percentage
index                             
WHITE              534   81.901840
OTHER               53    8.128834
BLACK               45    6.901840
HISPANIC/LATINO     15    2.300613
ASIAN                5    0.766871

          count  percentage
index                      
MARRIED     303    47.34375
SINGLE      213    33.28125
WIDOWED      79    12.34375
DIVORCED     45     7.03125

          count  percentage
index                      
Other       327   50.153374
Medicare    287   44.018405
Medicaid     38    5.828221



Index(['los', 'gender', 'age', 'heart_rate', 'respiratory_rate', 'hematocrit',
       'rdw', 'platelet', 'mcv', 'mch', 'hemoglobin', 'uc_only', 'cd_only',
       'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE',
       'language_ENGLISH', 'marital_status_MARRIED', 'marital_status_SINGLE',
       'marital_status_WIDOWED', 'insurance_Medicare', 'insurance_Other'],
      dtype='object')

In [12]:
# Min-max normalisation: rescale each listed feature to the [0, 1] range.

cols = ['age', 'heart_rate', 'respiratory_rate', 'hematocrit', 'rdw', 'platelet', 'mcv', 'mch', 'hemoglobin',]

features = data[cols]
col_min = features.min()
col_range = features.max() - col_min

# NOTE: a column whose values are all equal has a zero range and turns into NaN.
scaled = (features - col_min) / col_range

# Assign whole columns instead of writing through `data.loc[:, cols] = ...`:
# the `.loc` form writes into the existing columns in place, which cannot hold
# the scaled floats if any of these columns is integer-typed.
data[cols] = scaled

In [13]:
# Convert every bool column to int (0/1).
#
# The columns are replaced through `DataFrame.astype` rather than written with
# `data.loc[:, k] = ...`: the `.loc` form tries to store the integers inside
# the existing bool column, which recent pandas versions report as an
# incompatible-dtype assignment.
bool_cols = [k for k, v in data.dtypes.items() if v == bool]
data = data.astype({k: int for k in bool_cols})

In [14]:
# Optional log(los + 1) transform of `los`; currently disabled.
# data['los'] = np.log(data['los'] + 1)

# Write the processed table to ./data.csv. No `index=` argument is passed,
# so pandas' default handling of the row index applies.
data.to_csv('./data.csv')

In [15]:
# Display summary statistics of the processed table.
data.describe()

Unnamed: 0,los,gender,age,heart_rate,respiratory_rate,hematocrit,rdw,platelet,mcv,mch,...,race_BLACK,race_HISPANIC/LATINO,race_OTHER,race_WHITE,language_ENGLISH,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,insurance_Medicare,insurance_Other
count,652.0,652.0,652.0,652.0,652.0,652.0,652.0,651.0,652.0,652.0,...,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0
mean,3.290308,0.480061,0.545993,0.404975,0.271034,0.502072,0.590456,0.208523,0.771952,0.733008,...,0.069018,0.023006,0.081288,0.819018,0.955521,0.464724,0.326687,0.121166,0.440184,0.501534
std,4.353613,0.499986,0.215068,0.123228,0.140274,0.174386,0.091914,0.139935,0.073038,0.07781,...,0.25368,0.150038,0.273487,0.385299,0.206314,0.499137,0.469362,0.32657,0.49679,0.500382
min,0.318137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.142935,0.0,0.39319,0.31875,0.166667,0.374631,0.526515,0.114324,0.735043,0.695332,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,1.9115,0.0,0.555374,0.396875,0.25,0.5,0.57197,0.185053,0.769231,0.739558,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,3.566935,1.0,0.717939,0.475,0.357143,0.613569,0.632576,0.265569,0.811966,0.776413,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,46.9122,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
