In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

# Loading Data

In [None]:
# Read the ICU table and parse the `intime` column into pandas datetimes.
df = pd.read_csv('./data/icu_ibd_all_table.csv')
df['intime'] = pd.to_datetime(df['intime'])

# Quick sanity check: dimensions and the first few rows.
print(df.shape)
print(df.head())

In [None]:
# Read the patient table and turn the `anchor_year` value into a datetime
# pinned to 1 January of that year, so it can be subtracted from timestamps.
df2 = pd.read_csv('./data/patients_ibd.csv')
df2['anchor_year'] = pd.to_datetime(df2['anchor_year'].astype(str) + '-01-01')

# Quick sanity check: dimensions and the first few rows.
print(df2.shape)
print(df2.head())

# Preprocess Data

## Merge ICU Records with Patient Data

In [None]:
# Attach each subject's anchor_year (only available in df2) to the ICU rows.
merged = df.merge(
    df2[['subject_id', 'anchor_year']],
    how='left',
    on='subject_id',
)

# Order every subject's rows by admission time (`intime`).
merged = merged.sort_values(by=['subject_id', 'intime'])

# Shift `age` by the time elapsed between anchor_year and this row's intime,
# using 365 days per year.
# NOTE(review): presumably `age` in df is the age at anchor_year — confirm.
elapsed_years = (merged['intime'] - merged['anchor_year']).dt.days / 365

data = (
    merged
    .assign(age=elapsed_years + merged['age'])
    .drop(columns=['outtime', 'anchor_year'])
)

print(data.shape)
print(data.head())

## Drop Columns (Missing Value + Useless)

In [None]:
# Identify columns with too many missing values: more than 5 % of rows are NA.
# (The columns are only collected here; they are dropped in a later cell.)
missing_pct = data.isna().sum() / data.shape[0]*100
cols_missing = missing_pct[missing_pct > 5].index.tolist()

print('缺失值过多的字段')
print(cols_missing)

In [None]:
# Columns removed by hand because they are not needed downstream.
cols_useless = ['hadm_id', 'mortality']

print('手动删除的字段')
print(cols_useless)

In [None]:
# Everything to remove: high-missing columns first, then the hand-picked ones.
cols_except = [*cols_missing, *cols_useless]

# Report the column count on either side of the drop.
print(f'Cols (before): {data.shape[1]}')
data = data.drop(columns=cols_except)
print(f'Cols (after): {data.shape[1]}')

## Drop Rows (Missing Value)

In [None]:
# Drop rows with too many missing values: a row is kept only when its NA count
# does not exceed 5 % of (number of columns + 1).
# NOTE(review): the reason for the "+1" in the threshold is not visible here — confirm.
print(f'Rows (before): {data.shape[0]}')
na_per_row = data.isna().sum(axis=1)
max_na_allowed = (data.shape[1]+1)*0.05
data = data[na_per_row <= max_na_allowed]
print(f'Rows (after): {data.shape[0]}')

## Process Race

In [None]:
def parse_race(race):
    """Collapse a detailed race label into one of five coarse groups.

    The label is matched by case-sensitive substring search, checked in this
    order: 'WHITE', 'BLACK', 'HISPANIC' or 'LATINO', 'ASIAN'. The first match
    wins; anything else falls into 'OTHER'.

    Parameters
    ----------
    race : str or missing value
        Raw race label. Non-string input (NaN, None, pd.NA, ...) is treated as
        an unknown label instead of raising ``TypeError`` on the ``in`` test.

    Returns
    -------
    str
        One of 'WHITE', 'BLACK', 'HISPANIC/LATINO', 'ASIAN', 'OTHER'.
    """
    # A missing value cannot be substring-searched; group it with the other
    # unrecognised labels rather than crashing the whole `.apply`.
    if not isinstance(race, str):
        return 'OTHER'

    if 'WHITE' in race:
        return 'WHITE'
    if 'BLACK' in race:
        return 'BLACK'
    if 'HISPANIC' in race or 'LATINO' in race:
        return 'HISPANIC/LATINO'
    if 'ASIAN' in race:
        return 'ASIAN'
    return 'OTHER'

# Replace the raw race labels with their coarse group, row by row.
race_grouped = data.loc[:, 'race'].apply(parse_race)
data.loc[:, 'race'] = race_grouped

## Format Dummy Variables

In [None]:
# Manual "drop_first" for the type indicator: remove the `uc_cd` column.
# NOTE(review): presumably a complementary type column remains in `data` — confirm.
data = data.drop(columns=['uc_cd'])

In [None]:
# Categorical columns that will be converted into dummy variables.
cols = ['language', 'race', 'marital_status', 'insurance']

# For each of them, print a frequency table (count and percentage per level,
# with missing values counted as their own level).
for col in cols:
    level_counts = data[col].value_counts(dropna=False)
    summary = pd.DataFrame({
        'index': level_counts.index,
        'count': level_counts.values,
        'percentage': level_counts.values/level_counts.sum()*100,
    }).set_index('index')
    print(f'{summary}')
    print()

In [None]:
# One-hot encode the categorical columns listed in `cols`; each new column is
# prefixed with its source column name and the first level of every variable
# is dropped (drop_first) to serve as the reference level.
data = pd.get_dummies(
    data=data,
    columns=cols,
    prefix=cols,
    drop_first=True,
)

# Show the resulting column names.
data.columns

## Add ICU Count

In [None]:
# Running ICU-record number per subject: 1 for the earliest `intime`, 2 for the
# next, and so on. The counter is computed on a sorted copy and aligned back
# to `data` through the index.
# NOTE(review): this runs after rows with many missing values were dropped, so
# it numbers only the rows still present — confirm that is intended.
records_in_order = data.sort_values(by=['subject_id', 'intime'])
data['icu_count'] = records_in_order.groupby('subject_id').cumcount() + 1

# Normalization

In [None]:
# Numeric columns to rescale.
cols = [
    'age',
    'heart_rate',
    'respiratory_rate',
    'hematocrit',
    'rdw',
    'platelet',
    'mcv',
    'mch',
    'hemoglobin',
    'icu_count',
]

# Min-max scaling is used; StandardScaler() (imported above) is the alternative
# that was tried and left disabled.
scaler = MinMaxScaler()

# Fit on the selected columns and overwrite them with the scaled values.
scaled_values = scaler.fit_transform(data[cols])
data[cols] = scaled_values

In [None]:
# Convert every bool-typed column to int, leaving all other columns untouched.
bool_to_int = {name: int for name, dtype in data.dtypes.items() if dtype == bool}
data = data.astype(bool_to_int)

# Export Data

## First ICU Record

In [None]:
# Write the processed table to CSV (the DataFrame index is written too, as
# to_csv does by default).
# NOTE(review): despite the "first record" file name, no per-subject
# de-duplication is applied here, so every remaining row is exported. A
# disabled variant of this cell kept only the first row per subject_id
# (drop_duplicates(subset='subject_id', keep='first')) and dropped the
# 'die_in_icu' column — confirm which output is intended.
data.to_csv('./data_processed/data_first_record_.csv')

In [None]:
# 2D overview: length of stay (x) against subject ID (y), one marker per row,
# coloured per subject and sized by the (scaled) ICU count.
plt.figure(figsize=(12, 6))

# `icu_count` was min-max scaled to [0, 1] in the normalization step, so the
# smallest count maps to 0 and `icu_count * 100` alone gives those rows a
# marker area of 0, i.e. they are not drawn at all. A constant base area keeps
# every row visible while the size still grows with the count.
base_marker_area = 10

plt.scatter(
    data['los'],
    data['subject_id'],
    alpha=0.9,  # marker opacity
    s=data['icu_count']*100 + base_marker_area,
    c=data['subject_id'].astype('category').cat.codes,  # one colour code per subject
    cmap='viridis',
)

# Same axis labels as the 3D plot of these columns below.
plt.xlabel('Length of Stay (days)')
plt.ylabel('Subject ID')

plt.xticks(rotation=45)

In [None]:
# Axes3D is not referenced by name below; the import is presumably kept for its
# side effect of registering the '3d' projection on older matplotlib versions.
from mpl_toolkits.mplot3d import Axes3D

# One integer colour code per distinct subject_id.
color_labels = data['subject_id'].astype('category').cat.codes

# Marker area grows exponentially with the (scaled) ICU count.
marker_area = np.exp(data['icu_count']*5)

fig = plt.figure(figsize=(12, 8), dpi=300)
ax = fig.add_subplot(111, projection='3d')

# x = length of stay, y = subject ID, z = ICU count (third dimension).
scatter = ax.scatter(
    data['los'],
    data['subject_id'],
    data['icu_count'],
    s=marker_area,
    c=color_labels,
    cmap='jet',
    alpha=0.9,
)

ax.set_xlabel('Length of Stay (days)')
ax.set_ylabel('Subject ID')
ax.set_zlabel('ICU Count')
ax.set_title('3D Plot of ICU Data')

# NOTE(review): the colour bar is labelled 'Subject ID' but shows the category
# codes computed above, not the raw IDs.
plt.colorbar(scatter, ax=ax, label='Subject ID')


In [None]:
# Count rows of `data` per anchor_year_group, looked up from df2 by subject_id.
with_year_group = data.merge(
    df2[['subject_id', 'anchor_year_group']],
    how='left',
    on='subject_id',
)
with_year_group['anchor_year_group'].value_counts()

In [None]:
# Display the final processed DataFrame as the cell's output.
data