In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): none of the visible cells call them — confirm this is still needed.
tqdm.pandas()

In [None]:
# Read every column as text, then discard each row that has a missing value
# in any column (dropna() is called with no arguments).
df = pd.read_csv(
    'mimic-iii-clinical-database-1.4/DIAG_ROWS.csv',
    dtype='str',
).dropna()
df.head()

In [None]:
def encode(name, data=None, icd9=None, constant=0.00001):
    """Build the information-content (IC) encoding table for one code column.

    Each distinct code in column ``name`` gets one row containing its
    frequency, its information content ``-log(count / number_of_rows)``, the
    summed IC of all observed codes sharing its parent, its 1-based position
    among those siblings (in frequency order), and the final encoding
    ``PARENT_IC + constant * INDEX``.

    Parameters
    ----------
    name : str
        Column of ``data`` to encode; also the value matched against the
        ``LABEL`` column of the code table.
    data : pandas.DataFrame, optional
        Frame holding the codes. Defaults to the notebook-level ``df``.
    icd9 : pandas.DataFrame, optional
        Code table with columns ``'0'`` (code), ``'1'`` (parent) and
        ``'LABEL'``. Defaults to reading ``ICD9_CODES.csv``.
    constant : float, optional
        Multiplier applied to ``INDEX`` so siblings receive distinct encodings.

    Returns
    -------
    pandas.DataFrame
        Columns ``COUNT``, ``0``, ``IC``, ``PARENT_IC``, ``INDEX``, ``ENCODE``
        in the row order produced by ``value_counts``. Codes for which no
        parent is listed have NaN in ``PARENT_IC``, ``INDEX`` and ``ENCODE``.
    """
    codes = (df if data is None else data)[name]

    # count values and calculate info_content for each leaf code.
    # The Series is named *before* it becomes a frame: pandas >= 2.0 names the
    # value_counts() result "count" instead of the column name, so renaming
    # the column afterwards by `name` silently does nothing there.
    IC = codes.value_counts().rename("COUNT").to_frame().rename_axis(None)
    IC['0'] = IC.index
    IC['IC'] = -np.log(IC['COUNT'] / codes.size)

    # get parents
    if icd9 is None:
        # Read as text to match the dtype='str' used for the diagnosis rows;
        # otherwise numeric-looking codes are parsed as numbers and never
        # compare equal to the string codes being encoded.
        icd9 = pd.read_csv('ICD9_CODES.csv', dtype=str)
    links = icd9.loc[icd9['LABEL'] == name, ['0', '1']].dropna()
    # One parent per code; if a code is listed more than once, the last
    # listed row wins (duplicates would otherwise break the assignment).
    parent_of = links.drop_duplicates(subset='0', keep='last').set_index('0')['1']
    parent = IC['0'].map(parent_of).rename('PARENT')

    # calculate the IC for each parent based on its observed children.
    # Rows without a parent have a NaN key and are left out of every group.
    by_parent = IC['IC'].groupby(parent)
    IC['PARENT_IC'] = parent.map(by_parent.sum())
    # Position of each code among its siblings, following IC's row order;
    # masked explicitly because older pandas numbers NaN-key rows too.
    sibling_rank = by_parent.cumcount() + 1
    IC['INDEX'] = sibling_rank.where(parent.notna()).astype(float)

    # encode with added constant
    IC['ENCODE'] = IC['PARENT_IC'] + constant * IC['INDEX']
    return IC
    

In [None]:
# Build one encoding table per label, in the order the labels are listed.
IC_DIS, IC_EXT, IC_FACT, IC_SYMP = [
    encode(label)
    for label in (
        'DISEASES AND INJURIES',
        'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING',
        'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES',
        'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS',
    )
]

In [None]:
# Write each encoding table to its own CSV file, leaving out the index.
for table, path in (
    (IC_DIS, 'ICD_DIS_ENCODING.csv'),
    (IC_EXT, 'ICD_EXT_ENCODING.csv'),
    (IC_FACT, 'ICD_FACT_ENCODING.csv'),
    (IC_SYMP, 'ICD_SYMP_ENCODING.csv'),
):
    table.to_csv(path, index=False)