# Importing Packages

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
# Notebook-wide MinMaxScaler instance; it is fitted later on `anchor_age`
# when the Part I (age) feature is built.
scaler = MinMaxScaler()

In [10]:
# Despite its name this is not a statistical p-value: it is the minimum share
# of patients that must carry an ICD code for the code to be kept as a feature.
p_value = 0.05

# Directory (relative to the notebook) that holds the input CSV files.
data_dir = 'data'

# Loading Data

In [11]:
# Diagnosis rows joined with patient attributes (one row per recorded ICD code).
complications_path = os.path.join(data_dir, 'complications_patients.csv')
df = pd.read_csv(complications_path)

# Quick sanity check of the size and the available columns.
df.shape, df.columns

((122140, 11),
 Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version',
        'subject_id.1', 'gender', 'anchor_age', 'anchor_year',
        'anchor_year_group', 'dod'],
       dtype='object'))

# Building Data

In [12]:
# Collapse the diagnosis-level table to one row per patient, keeping the first
# non-null value of each patient-level attribute.
patient_level_cols = ['gender', 'anchor_age', 'anchor_year_group', 'dod']
df_demographics = (
    df.groupby('subject_id')[patient_level_cols]
    .first()
    .reset_index()
)

# Number of distinct patients; used later as the denominator when computing
# the share of patients carrying each ICD code.
p_nums = len(df_demographics)

df_demographics

Unnamed: 0,subject_id,gender,anchor_age,anchor_year_group,dod
0,10001186,F,46,2011 - 2013,
1,10007174,M,70,2011 - 2013,
2,10018852,M,19,2011 - 2013,
3,10024331,M,72,2008 - 2010,2145-01-23
4,10025647,M,83,2008 - 2010,2181-06-16
...,...,...,...,...,...
2412,19971435,F,30,2011 - 2013,
2413,19980334,F,86,2008 - 2010,2117-02-26
2414,19985000,F,30,2008 - 2010,
2415,19987152,F,37,2011 - 2013,


In [13]:
# Attach admission-level attributes to each patient.
# groupby(...).last() keeps, per column, the last non-null value in file order.
# NOTE(review): this is only the most recent admission if admissions.csv is
# ordered chronologically within each patient - confirm.
admission_cols = ['subject_id', 'insurance', 'language', 'marital_status', 'race']
last_admission = (
    pd.read_csv(os.path.join(data_dir, 'admissions.csv'))
    .groupby('subject_id')
    .last()
    .reset_index()[admission_cols]
)

# Default inner join: only patients present in both tables are kept.
demographics = pd.merge(df_demographics, last_admission, on='subject_id')

In [14]:
# value_counts() leaves out missing values by default, so these counts can sum
# to fewer than the number of patients.
demographics['marital_status'].value_counts()

marital_status
MARRIED     1151
SINGLE       907
WIDOWED      182
DIVORCED     158
Name: count, dtype: int64

In [15]:
def parse_race(race):
    """Collapse a detailed race description into one of five coarse groups.

    Matching is by substring and case-insensitive. Groups are checked in a
    fixed order (WHITE, BLACK, HISPANIC/LATINO, ASIAN), so a value matching
    several keywords is assigned to the first group that matches.

    Parameters
    ----------
    race : str or missing value
        Raw race description, e.g. 'WHITE - RUSSIAN'. Non-string input
        (NaN, None, ...) is treated as unknown.

    Returns
    -------
    str
        One of 'WHITE', 'BLACK', 'HISPANIC/LATINO', 'ASIAN' or 'OTHER'.
    """
    # Missing values arrive from pandas as float NaN (or None); the substring
    # test below would raise TypeError on them, so classify them up front.
    if not isinstance(race, str):
        return 'OTHER'

    normalized = race.upper()

    # (group label, keywords) in priority order.
    groups = (
        ('WHITE', ('WHITE',)),
        ('BLACK', ('BLACK',)),
        ('HISPANIC/LATINO', ('HISPANIC', 'LATINO')),
        ('ASIAN', ('ASIAN',)),
    )
    for label, keywords in groups:
        if any(keyword in normalized for keyword in keywords):
            return label
    return 'OTHER'

# Replace the detailed race descriptions with their coarse group, element-wise.
demographics['race'] = demographics['race'].map(parse_race)

In [16]:
# Display the merged demographics table (race already collapsed into groups).
demographics

Unnamed: 0,subject_id,gender,anchor_age,anchor_year_group,dod,insurance,language,marital_status,race
0,10001186,F,46,2011 - 2013,,Other,ENGLISH,MARRIED,WHITE
1,10007174,M,70,2011 - 2013,,Other,ENGLISH,SINGLE,WHITE
2,10018852,M,19,2011 - 2013,,Other,ENGLISH,SINGLE,WHITE
3,10024331,M,72,2008 - 2010,2145-01-23,Medicare,ENGLISH,WIDOWED,WHITE
4,10025647,M,83,2008 - 2010,2181-06-16,Medicare,ENGLISH,MARRIED,WHITE
...,...,...,...,...,...,...,...,...,...
2412,19971435,F,30,2011 - 2013,,Other,ENGLISH,SINGLE,WHITE
2413,19980334,F,86,2008 - 2010,2117-02-26,Medicare,ENGLISH,MARRIED,WHITE
2414,19985000,F,30,2008 - 2010,,Other,ENGLISH,MARRIED,WHITE
2415,19987152,F,37,2011 - 2013,,Other,ENGLISH,MARRIED,WHITE


## y: dod

In [8]:
# Binary target: 1 when a date of death (`dod`) is recorded, 0 otherwise.
# The label is built with .assign() on a new frame instead of writing through
# attribute assignment (`y.dod = ...`) into a slice of `demographics`, which
# can raise SettingWithCopyWarning and is easy to misread.
y = (
    demographics[['subject_id', 'dod']]
    .assign(dod=lambda frame: frame['dod'].notna().astype(int))
)

y

Unnamed: 0,subject_id,dod
0,10001186,0
1,10007174,0
2,10018852,0
3,10024331,1
4,10025647,1
...,...,...
2412,19971435,0
2413,19980334,1
2414,19985000,0
2415,19987152,0


## Part I: age

In [9]:
# Take an explicit copy so the scaled values are written to an independent
# frame rather than to a slice of `demographics` (avoids SettingWithCopyWarning).
partI = demographics.loc[:, ['subject_id', 'anchor_age']].copy()

# Min-max scale age to the [0, 1] range.
# NOTE(review): the scaler is fitted on every patient here; if this table is
# later split into train/test sets, fit it on the training portion only to
# avoid leaking test-set statistics.
partI[['anchor_age']] = scaler.fit_transform(partI[['anchor_age']])

partI

Unnamed: 0,subject_id,anchor_age
0,10001186,0.383562
1,10007174,0.712329
2,10018852,0.013699
3,10024331,0.739726
4,10025647,0.890411
...,...,...
2412,19971435,0.164384
2413,19980334,0.931507
2414,19985000,0.164384
2415,19987152,0.260274


## Part II: categorical demographic variables

In [10]:
# One-hot encode the categorical demographics. The first level of every
# variable is dropped (drop_first=True) and acts as the reference category.
#
# dtype=int makes get_dummies emit 0/1 integers directly. The previous
# approach assigned integers back into the boolean indicator columns through
# .loc, which is an incompatible-dtype assignment that recent pandas versions
# deprecate. Listing `columns=` explicitly also guarantees that `subject_id`
# is never encoded, whatever dtype it was read with.
#
# NOTE(review): a missing value is encoded as all-zero indicators, which is
# indistinguishable from the dropped reference level; confirm this is intended
# for columns that contain NaNs (marital_status appears to have some).
categorical_cols = ['gender', 'anchor_year_group', 'insurance', 'language', 'marital_status', 'race']

partII = pd.get_dummies(
    demographics.loc[:, ['subject_id'] + categorical_cols],
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)

partII

Unnamed: 0,subject_id,gender_M,anchor_year_group_2011 - 2013,anchor_year_group_2014 - 2016,insurance_Medicare,insurance_Other,language_ENGLISH,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,race_BLACK,race_HISPANIC/LATINO,race_OTHER,race_WHITE
0,10001186,0,1,0,0,1,1,1,0,0,0,0,0,1
1,10007174,1,1,0,0,1,1,0,1,0,0,0,0,1
2,10018852,1,1,0,0,1,1,0,1,0,0,0,0,1
3,10024331,1,0,0,1,0,1,0,0,1,0,0,0,1
4,10025647,1,0,0,1,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,19971435,0,1,0,0,1,1,0,1,0,0,0,0,1
2413,19980334,0,0,0,1,0,1,1,0,0,0,0,0,1
2414,19985000,0,0,0,0,1,1,1,0,0,0,0,0,1
2415,19987152,0,1,0,0,1,1,1,0,0,0,0,0,1


## Part III: ICD Codes

聚合到 (subject_id, icd_code) 层面

然后计算每个 icd code 分别有多少位不同的 IBD 病人

In [17]:
# Codes listed here are removed from the prevalence table below.
# NOTE(review): every entry is an ICD-9 style code, while the data also holds
# ICD-10 style codes (there is an `icd_version` column) - confirm whether the
# ICD-10 IBD codes should be excluded as well.
ibd_icd_codes = ['5550', '5551', '5552', '5559', '5560', '5561', '5562', '5563', '5564', '5565', '5566', '5568', '5569', 'V4986']

# For every ICD code: how many distinct patients carry it (`count`) and which
# share of all patients that is (`percent`), most frequent codes first.
stats = (
    df[['subject_id', 'icd_code']]
    .dropna()
    .drop_duplicates()            # one row per (patient, code) pair
    .groupby('icd_code')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .assign(percent=lambda frame: frame['count'] / p_nums)
    .loc[lambda frame: ~frame['icd_code'].isin(ibd_icd_codes)]
)

stats

Unnamed: 0,icd_code,count,percent
1082,4019,810,0.335126
1525,53081,678,0.280513
586,2859,523,0.216384
5905,V1582,503,0.208109
791,311,502,0.207695
...,...,...,...
503,27411,1,0.000414
3744,F430,1,0.000414
3746,F4312,1,0.000414
3751,F4325,1,0.000414


In [12]:
# Keep the codes carried by at least `p_value` (a fraction) of all patients.
is_frequent = stats['percent'] >= p_value
suspected_icd_codes = stats.loc[is_frequent, 'icd_code']

suspected_icd_codes

1082     4019
1525    53081
586      2859
5905    V1582
791       311
        ...  
2218    71590
2540    78659
1109      412
6060    V5869
3460    E8490
Name: icd_code, Length: 99, dtype: object

In [13]:
# Patient x ICD-code indicator matrix restricted to the frequent codes.
# The pivot counts `hadm_id` entries per (patient, code); combinations that
# never occur are NaN, and `NaN > 0` is False, so they become 0.
pivot_data = (
    pd.pivot_table(
        df,
        index='subject_id',
        columns='icd_code',
        values='hadm_id',
        aggfunc='count',
    )
    .loc[:, suspected_icd_codes]
    .gt(0)
    .astype(int)
    .reset_index()
    .rename_axis(columns=None)   # drop the leftover 'icd_code' columns label
)

partIII = pivot_data

partIII

Unnamed: 0,subject_id,4019,53081,2859,V1582,311,2724,5849,30000,27651,...,78702,51881,7840,78900,79092,71590,78659,412,V5869,E8490
0,10001186,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10007174,1,0,0,1,1,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,10018852,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10024331,1,1,1,0,1,1,1,0,1,...,0,0,0,0,0,0,1,1,0,0
4,10025647,1,0,0,1,1,1,1,0,1,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,19971435,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2413,19980334,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2414,19985000,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2415,19987152,0,0,0,1,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


## Combining All Parts

In [14]:
# Join the target and the three feature groups on the patient id (inner joins,
# so only patients present in every part are kept), then drop the id so that
# only the label and the model features remain.
data = (
    y.merge(partI, on='subject_id')
    .merge(partII, on='subject_id')
    .merge(partIII, on='subject_id')
    .drop(columns='subject_id')
)

data

Unnamed: 0,dod,anchor_age,gender_M,anchor_year_group_2011 - 2013,anchor_year_group_2014 - 2016,insurance_Medicare,insurance_Other,language_ENGLISH,marital_status_MARRIED,marital_status_SINGLE,...,78702,51881,7840,78900,79092,71590,78659,412,V5869,E8490
0,0,0.383562,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.712329,1,1,0,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,0,0.013699,1,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0.739726,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
4,1,0.890411,1,0,0,1,0,1,1,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,0,0.164384,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2413,1,0.931507,0,0,0,1,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2414,0,0.164384,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2415,0,0.260274,0,1,0,0,1,1,1,0,...,1,0,0,1,0,0,0,0,0,0


In [15]:
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise the write fails on a fresh checkout.
output_path = './data_preprocessed/data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The row index is written as the first (unnamed) column of the file.
data.to_csv(output_path)