In [24]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Loading Data

In [25]:
# Load the ICU table and convert the `intime` column to datetimes.
icu_table_path = './data/icu_ibd_all_table.csv'
df = pd.read_csv(icu_table_path).assign(
    intime=lambda frame: pd.to_datetime(frame['intime'])
)
df.shape

(1161, 32)

In [26]:
def func(x):
    """Return the timestamp of January 1st for the year value ``x``."""
    return pd.to_datetime('{}-01-01'.format(x))


# Load the patient table and turn `anchor_year` into a Jan-1 timestamp.
df2 = pd.read_csv('./data/patients_ibd.csv')
df2['anchor_year'] = df2['anchor_year'].map(func)

df2.shape

(2417, 6)

# Preprocess Data

## Merge ICU Stays with Patient Anchor Year

In [27]:
# Attach each patient's anchor_year to every row of the ICU table.
# The patient-side keys are de-duplicated first (and the merge is validated as
# many-to-one) so that a repeated subject_id in df2 can never silently
# duplicate rows of df.
anchor_years = df2[['subject_id', 'anchor_year']].drop_duplicates(subset='subject_id')
data = (
    df.merge(anchor_years, on='subject_id', how='left', validate='many_to_one')
    .sort_values(by=['subject_id', 'intime'])
)

# Add the time elapsed between anchor_year and intime (in 365-day years) to
# the existing `age` column.
# NOTE(review): presumably `age` holds the age at anchor_year -- confirm against the source table.
data['age'] = ((data['intime'] - data['anchor_year']).dt.days) / 365 + data['age']

In [28]:
# Preview the first rows of the merged frame.
data.head(n=5)

Unnamed: 0,hadm_id,subject_id,intime,outtime,los,mortality,gender,age,weight,bmi,...,CRP,race,language,marital_status,insurance,die_in_icu,uc_only,cd_only,uc_cd,anchor_year
327,22643604,10024331,2141-03-18 19:36:08,2141-03-22 22:08:21,4.10571,1,1,73.210959,,,...,,WHITE,ENGLISH,MARRIED,Medicare,0,1,0,0,2140-01-01
1022,28899194,10025647,2176-09-22 17:57:15,2176-09-24 17:11:19,1.9681,1,1,84.726027,,,...,,WHITE,ENGLISH,MARRIED,Medicare,0,1,0,0,2175-01-01
896,27617929,10037975,2185-01-17 19:12:12,2185-01-22 16:16:52,4.87824,1,1,60.043836,,,...,,UNKNOWN,ENGLISH,MARRIED,Medicare,1,0,1,0,2185-01-01
110,20845468,10048262,2168-08-21 00:21:53,2168-08-21 11:03:57,0.44588,0,1,46.641096,,,...,,WHITE,ENGLISH,MARRIED,Medicare,0,1,0,0,2163-01-01
908,27715453,10056223,2122-09-23 15:08:45,2122-09-28 16:07:53,5.04106,0,1,50.728767,,,...,,HISPANIC/LATINO - DOMINICAN,?,SINGLE,Medicaid,0,1,0,0,2120-01-01


## Drop Columns with Missing Values and Unneeded Columns

In [29]:
# Check missing values: collect every column whose share of NA entries
# exceeds 5% of the rows.
na_percent = data.isna().sum() / data.shape[0] * 100
cols_missing = [col for col in data.columns if na_percent[col] > 5]

In [30]:
# Drop the unneeded columns: the sparse columns collected in `cols_missing`
# plus the columns listed below. 'intime' and 'die_in_icu' are deliberately
# left out of the list, so they are kept.
cols_useless = ['hadm_id', 'outtime', 'mortality', 'anchor_year']
cols_except = [*cols_missing, *cols_useless]

data = data.drop(columns=cols_except)

In [31]:
# Preview the frame after the column drop.
data.head(n=5)

Unnamed: 0,subject_id,intime,los,gender,age,heart_rate,respiratory_rate,hematocrit,rdw,platelet,...,mch,hemoglobin,race,language,marital_status,insurance,die_in_icu,uc_only,cd_only,uc_cd
327,10024331,2141-03-18 19:36:08,4.10571,1,73.210959,70.0,16.0,29.1,12.8,151.0,...,30.2,9.4,WHITE,ENGLISH,MARRIED,Medicare,0,1,0,0
1022,10025647,2176-09-22 17:57:15,1.9681,1,84.726027,70.0,19.0,31.2,15.3,192.0,...,27.7,10.5,WHITE,ENGLISH,MARRIED,Medicare,0,1,0,0
896,10037975,2185-01-17 19:12:12,4.87824,1,60.043836,90.0,37.0,38.1,16.9,16.0,...,30.7,12.5,UNKNOWN,ENGLISH,MARRIED,Medicare,1,0,1,0
110,10048262,2168-08-21 00:21:53,0.44588,1,46.641096,101.0,26.0,,,,...,,,WHITE,ENGLISH,MARRIED,Medicare,0,1,0,0
908,10056223,2122-09-23 15:08:45,5.04106,1,50.728767,89.0,20.0,26.1,15.4,77.0,...,31.0,9.4,HISPANIC/LATINO - DOMINICAN,?,SINGLE,Medicaid,0,1,0,0


## Drop Rows with Too Many Missing Values

In [32]:
# Keep only the rows whose count of missing fields does not exceed 5% of
# (number of columns + 1).
# NOTE(review): the "+ 1" is carried over unchanged from the original threshold -- confirm the intent.
max_missing_per_row = (data.shape[1] + 1) * 0.05
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the in-place edits made to `data` further down (column assignment,
# column drops) operate on a real frame and do not trigger
# SettingWithCopyWarning.
data = data[data.isna().sum(axis=1) <= max_missing_per_row].copy()

In [33]:
# Shape of the array of distinct subject_id values left after filtering.
print(data['subject_id'].unique().shape)

(652,)


## Formatting Race

In [34]:
def parse_race(race):
    """Collapse a raw race label into one of five coarse categories.

    The label is matched by substring, in this order: 'WHITE', 'BLACK',
    'HISPANIC' or 'LATINO', 'ASIAN'. The first match wins.

    Parameters
    ----------
    race : str or missing value
        Raw race label. Non-string input (e.g. NaN or None for a missing
        value) is accepted and mapped to 'OTHER' instead of raising
        ``TypeError`` on the substring test.

    Returns
    -------
    str
        One of 'WHITE', 'BLACK', 'HISPANIC/LATINO', 'ASIAN', 'OTHER'.
    """
    # A missing value arrives as float NaN / None; `'WHITE' in nan` would raise.
    if not isinstance(race, str):
        return 'OTHER'

    # Ordered (keywords, category) rules; order matters because the first
    # matching rule decides the category.
    rules = (
        (('WHITE',), 'WHITE'),
        (('BLACK',), 'BLACK'),
        (('HISPANIC', 'LATINO'), 'HISPANIC/LATINO'),
        (('ASIAN',), 'ASIAN'),
    )
    for keywords, category in rules:
        if any(keyword in race for keyword in keywords):
            return category
    return 'OTHER'


# Replace every raw race label with its coarse category.
normalized_race = data['race'].map(parse_race)
data.loc[:, 'race'] = normalized_race

## Formatting Dummy Variables

In [35]:
# Remove the `uc_cd` column. The result is rebound instead of dropped in
# place: at this point `data` is the product of a boolean-mask selection, and
# mutating such a frame in place is what raises SettingWithCopyWarning.
data = data.drop(columns=['uc_cd'])

In [36]:
# Handle dummy variables: one-hot encode the categorical columns, dropping
# the first level of each one.
cols = ['language', 'race', 'marital_status', 'insurance']

dummy_options = dict(columns=cols, prefix=cols, drop_first=True)
data = pd.get_dummies(data, **dummy_options)

data.columns

Index(['subject_id', 'intime', 'los', 'gender', 'age', 'heart_rate',
       'respiratory_rate', 'hematocrit', 'rdw', 'platelet', 'mcv', 'mch',
       'hemoglobin', 'die_in_icu', 'uc_only', 'cd_only', 'language_ENGLISH',
       'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE',
       'marital_status_MARRIED', 'marital_status_SINGLE',
       'marital_status_WIDOWED', 'insurance_Medicare', 'insurance_Other'],
      dtype='object')

## Standardization

In [37]:
# Standardize the listed numeric columns with a StandardScaler fitted on
# those same columns.
cols = [
    'age',
    'heart_rate',
    'respiratory_rate',
    'hematocrit',
    'rdw',
    'platelet',
    'mcv',
    'mch',
    'hemoglobin',
]

scaler = StandardScaler()
scaler.fit(data[cols])

data[cols] = scaler.transform(data[cols])

In [38]:
# Convert every bool column to int.
# The cast goes through `astype` with a per-column mapping, which always
# yields columns of the requested dtype. Writing the converted values back
# with `data.loc[:, col] = ...` instead targets the existing bool column, and
# whether that ends up as an int column depends on the pandas version.
bool_cols = data.select_dtypes(include='bool').columns
data = data.astype({col: int for col in bool_cols})

# Export Data

## For Statistics

In [44]:
# Build the per-patient table used for descriptive statistics: the first row
# of `data` for each subject_id, with the original (un-encoded) demographic
# columns from `df` attached.
#
# Both sides are reduced to one row per subject_id *before* the merge. Merging
# the full frames is a many-to-many join on subject_id that multiplies rows
# per patient only to discard them again; picking the first row on each side
# up front selects the same rows and lets the merge be validated as
# one-to-one. The only difference in the written CSV is the unnamed index
# column, which is now a plain 0..n-1 range.
#
# A descriptive name is used instead of `_`, which IPython reserves for the
# previous cell output.
demographic_cols = ['subject_id', 'race', 'language', 'marital_status', 'insurance']

first_stays = data.drop_duplicates(subset='subject_id', keep='first')
demographics = df[demographic_cols].drop_duplicates(subset='subject_id', keep='first')

data_stats = first_stays.merge(
    demographics, on='subject_id', how='left', validate='one_to_one'
)
data_stats['race'] = data_stats['race'].apply(parse_race)

data_stats.to_csv('./data_processed/data_stats.csv')

## First ICU Record

In [19]:
# Export the first row per subject_id (in the current row order), without the
# `die_in_icu` column.
first_records = data.drop_duplicates(subset='subject_id', keep='first')
first_records = first_records.drop(columns=['die_in_icu'])
first_records.to_csv('./data_processed/data_first_record.csv')

## Died in ICU

In [23]:
# Export the rows flagged with die_in_icu == 1, without the flag column itself.
died_mask = data['die_in_icu'] == 1
died_in_icu = data.loc[died_mask].drop(columns=['die_in_icu'])
died_in_icu.to_csv('./data_processed/data_die_in_icu.csv')