In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_apply` family of
# methods to pandas objects).
tqdm.pandas()

In [None]:
# Load the diagnosis rows. Every column is read as text, so codes such as
# '0389' keep their leading zero.
df = pd.read_csv('mimic-iii-clinical-database-1.4/DIAG_ROWS.csv', dtype='str')
# Count distinct admissions; a missing HADM_ID counts as one value.
print("Unique HADM_ID: ", df['HADM_ID'].nunique(dropna=False))

df.head()

In [None]:
# Number of rows whose 'DISEASES AND INJURIES' code is exactly '42731'.
int((df['DISEASES AND INJURIES'] == '42731').sum())

In [None]:
# Load the ICD-9 lookup used by `encode` (column '0' = code, column '1' = the
# code it is grouped under, 'LABEL' = category name).
# Read everything as text: the diagnosis frame is loaded as text too, and the
# codes are compared as strings. Letting pandas infer dtypes could strip
# leading zeros ('0389' -> 389) or produce mixed int/str columns, which would
# silently break the code matching.
ICD9 = pd.read_csv('ICD9_CODES.csv', dtype='str')

# Sanity check: lookup rows grouped under code '401'.
ICD9[ICD9['1'] == '401']

In [None]:
def encode(name, data=None, icd9=None):
    """Build the information-content encoding table for one category column.

    For every distinct code in ``data[name]`` the table holds its frequency,
    its probability relative to *all* rows of the column (missing values are
    part of the denominator) and its information content
    ``IC = -log10(probability)``.

    Codes are then grouped by their parent code, taken from column ``'1'`` of
    the lookup rows whose ``'LABEL'`` equals ``name``. Each code receives:

    * ``PARENT_IC`` - sum of ``IC`` over the parent's children present in the
      data,
    * ``INDEX``     - 1-based position of the code among those children, in
      the frequency order of the table (most frequent first),
    * ``ENCODE``    - ``PARENT_IC + Constant * INDEX`` with ``Constant = 0.1``.

    Codes that have no parent in the lookup keep NaN in these three columns.

    Parameters
    ----------
    name : str
        Column of ``data`` to encode; also the ``'LABEL'`` value selecting the
        relevant lookup rows.
    data : pandas.DataFrame, optional
        Frame holding the codes. Defaults to the notebook-level ``df``.
    icd9 : pandas.DataFrame, optional
        Lookup with columns ``'0'`` (code), ``'1'`` (parent) and ``'LABEL'``.
        Defaults to the notebook-level ``ICD9``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct code, indexed by the code (unnamed index), with
        columns ``Frekvens``, ``<name>``, ``Total rows``, ``Probability``,
        ``IC``, ``PARENT_IC``, ``INDEX``, ``Constant`` and ``ENCODE``.
    """
    if data is None:
        data = df
    if icd9 is None:
        icd9 = ICD9

    codes = data[name]
    total_rows = codes.size

    # Frequency table. Naming the Series before `to_frame()` gives the same
    # column on every pandas version (pandas >= 2.0 calls the result "count",
    # older versions call it `name`).
    IC = codes.value_counts().rename('Frekvens').to_frame()
    # pandas >= 2.0 names the index after the column; clear it so the code
    # column created below is not ambiguous with the index in later merges.
    IC.index.name = None
    IC['0'] = IC.index
    IC['Total rows'] = total_rows
    IC['Probability'] = IC['Frekvens'] / total_rows
    IC['IC'] = -np.log10(IC['Probability'])

    # Map every code to its parent. Duplicate lookup rows are collapsed so a
    # code is counted once in its parent's sum.
    category_rows = icd9[icd9['LABEL'] == name]
    parent_by_code = (
        category_rows
        .dropna(subset=['0', '1'])
        .drop_duplicates(subset='0')
        .set_index('0')['1']
    )
    parent = IC['0'].map(parent_by_code)

    # Aggregate per parent over the codes that have one. Assigning the
    # results aligns on the index, leaving NaN for codes without a parent.
    has_parent = parent.notna()
    by_parent = IC[has_parent].groupby(parent[has_parent])
    IC['PARENT_IC'] = by_parent['IC'].transform('sum')
    IC['INDEX'] = (by_parent.cumcount() + 1).astype(float)

    # Offset siblings from each other by a fixed step on top of the parent IC.
    IC['Constant'] = 0.1
    IC['ENCODE'] = IC['PARENT_IC'] + IC['Constant'] * IC['INDEX']

    IC = IC.rename(columns={'0': name})
    return IC
    

In [None]:
# Build one encoding table per category column (same order as before).
IC_DIS, IC_EXT, IC_FACT, IC_SYMP = (
    encode(category)
    for category in (
        'DISEASES AND INJURIES',
        'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING',
        'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES',
        'SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS',
    )
)

In [None]:
# Attach the encoding of each category to the diagnosis rows.
# Every table exposes its value as 'ENCODE'; merging four of them under the
# same name makes pandas fall back to the '_x'/'_y' suffixes, and the fourth
# merge then yields duplicate column labels (a MergeError on pandas 2.x).
# Each category therefore gets its own explicitly named column.
ENCODE_SOURCES = {
    'ENCODE_DIS': ('DISEASES AND INJURIES', IC_DIS),
    'ENCODE_SYMP': ('SYMPTOMS, SIGNS, AND ILL-DEFINED CONDITIONS', IC_SYMP),
    'ENCODE_FACT': ('SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES', IC_FACT),
    'ENCODE_EXT': ('SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING', IC_EXT),
}

# Drop columns left by a previous execution so the cell can be re-run safely.
df = df.drop(columns=list(ENCODE_SOURCES), errors='ignore')

for encode_col, (code_col, ic_table) in ENCODE_SOURCES.items():
    df_merge = (
        ic_table[[code_col, 'ENCODE']]
        .rename(columns={'ENCODE': encode_col})
        .reset_index(drop=True)  # the index carries the codes too; keep only the column
    )
    # many_to_one: each code may appear only once in the lookup, otherwise the
    # left merge would silently duplicate diagnosis rows.
    df = pd.merge(df, df_merge, on=code_col, how='left', validate='many_to_one')

df.head()

In [None]:
# Inspect the encoding rows for codes 4010, 4011 and 4019.
IC_DIS.loc[IC_DIS['DISEASES AND INJURIES'].isin(['4011', '4010', '4019'])]

In [None]:
# Inspect the encoding row for code 0389.
IC_DIS.loc[IC_DIS['DISEASES AND INJURIES'] == '0389']

In [None]:
# Persist the encoded diagnosis rows; the positional index is not written.
df.to_csv(path_or_buf='ENC.csv', index=False)

In [None]:
# Display the encoding table for 'DISEASES AND INJURIES'.
IC_DIS