In [None]:
import pandas as pd
from tqdm import tqdm

# Register tqdm's progress_* methods (such as progress_map) on pandas objects.
tqdm.pandas()

# for jupyter notebook
# Setting the option to None turns off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Read the gzip-compressed ICD9CM CSV from the home directory.
# The cells below use the 'Class ID', 'Preferred Label' and 'Parents' columns.
df = pd.read_csv('~/ICD9CM.csv.gz')
#df = df[['Class ID', 'Preferred Label', 'Parents']]

## Search data

In [None]:
# Show the preferred labels of all rows whose Class ID matches the pattern.
# The match runs on a temporary lower-cased copy, so df['Class ID'] keeps its
# original case: later cells compare it with the un-lowered 'Parents' column
# and with upper-case codes such as 'T051', and those comparisons would fail
# if the column were lower-cased in place.
class_id_lower = df['Class ID'].str.lower()

df.loc[class_id_lower.str.contains("/0.?1.?1.?6.?6", na=False), 'Preferred Label'].values

# Split the URL to get codes

In [None]:
def splitICD9(line):
    """Return the part of ``line`` that follows its last '/'.

    Parameters
    ----------
    line : object
        Usually a URL string whose final path segment is the code. Any
        non-string value (for example NaN from an empty cell) is accepted.

    Returns
    -------
    object
        For a string, the text after the last '/'; a string without any '/'
        is returned unchanged. Non-string values are returned as they are.
    """
    if isinstance(line, str):
        # [-1] instead of [1]: rsplit yields a single element when there is
        # no '/', so [1] raised IndexError (e.g. when this was applied a
        # second time to values that had already been split).
        return line.rsplit('/', 1)[-1]
    return line

In [None]:
# Reduce both URL columns to the notation (the text after the last '/').
for url_column in ('Class ID', 'Parents'):
    df[url_column] = df[url_column].progress_map(splitICD9)

# Make dataframe containing codes with all the parents

First we need to identify all the super parents (top-level classes), so that we can stop adding parents once we reach one of them. The super parents found so far are listed below. The first four are marked with the label owl#Thing.

* '001-999.99': 'DISEASES AND INJURIES',
* '00-99.99': 'PROCEDURES',
* 'E000-E999.9':'SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING',
* 'V01-V91.99':'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES',
* 'T051': 'Event',
* 'T071': 'Entity'

In [None]:
# Values that end the walk up the hierarchy: the tree-building loop stops as
# soon as the current row's 'Parents' or 'Class ID' is one of these.
last = ['owl#Thing', 'T051', 'T071']

In [None]:
# Build one row per code: [preferred label, code, parent, grandparent, ...].
# A row ends at the first ancestor whose own parent (or code) is in `last`.
# Rows have different lengths; shorter rows are padded with missing values.

# Map every code to its parent once, so that each step up the hierarchy is a
# dictionary lookup instead of a scan over the whole frame. setdefault keeps
# the first occurrence of a duplicated code, as the former `[0]` lookup did.
parent_of = {}
for code, parent in zip(df['Class ID'], df['Parents']):
    parent_of.setdefault(code, parent)

trees = []
# Codes whose chain could not be followed up to a top-level class (missing
# parent value, parent absent from the table, or a cycle). The chain is cut
# at that point instead of raising IndexError or looping forever; inspect
# this list after the cell has run.
unresolved_codes = []

for label, code, parent in tqdm(
        zip(df['Preferred Label'], df['Class ID'], df['Parents']),
        total=df.shape[0]):
    # add label and code to the row
    tree = [label, code]
    visited = {code}

    # add parents to the row
    current_code, current_parent = code, parent
    while (current_parent not in last) and (current_code not in last):
        if (pd.isna(current_parent)
                or current_parent not in parent_of
                or current_parent in visited):
            unresolved_codes.append(code)
            break
        current_code = current_parent
        current_parent = parent_of[current_code]
        tree.append(current_code)
        visited.add(current_code)

    trees.append(tree)

# Create the dataframe once from all rows. DataFrame.append no longer exists
# in pandas 2.x, and growing a frame row by row is quadratic.
ICD9_df = pd.DataFrame(trees)


In [None]:
#ICD9_df.head()

In [None]:
# Write the code/ancestor table to CSV in the home directory, without the index.
ICD9_df.to_csv('~/ICD9_CODES',index=False)

# Split between procedures and diagnoses

In [None]:
# SPLIT BETWEEN DIAGNOSES AND PROCEDURES

def _top_level_code(row):
    """Return the last non-missing value of ``row``, or None if all are missing."""
    position = row.last_valid_index()
    return None if position is None else row[position]


# get superclass: rows have different lengths, so the top-level class is the
# last valid (non-missing) entry of each row.
top_level = pd.Series(
    [_top_level_code(row)
     for _, row in tqdm(ICD9_df.iterrows(), total=ICD9_df.shape[0])],
    index=ICD9_df.index,
    dtype=object,
)

# Select each subset with one boolean mask. DataFrame.append no longer exists
# in pandas 2.x, and appending row by row is quadratic.

# create diagnose dataframe
df_diag = ICD9_df[top_level == '001-999.99'].reset_index(drop=True)

# create procedures dataframe (named df_symp because later cells use that name)
df_symp = ICD9_df[top_level == '00-99.99'].reset_index(drop=True)

In [None]:
# Write both subsets to CSV in the home directory, without the index.
# df_symp holds the rows whose top-level code is '00-99.99' (procedures).
df_diag.to_csv('~/ICD9_DIAG_CODE', index=False)
df_symp.to_csv('~/ICD9_PROC_CODE', index=False)

In [None]:
# Keep only the code and its preferred label, then display the frame.
df = df.loc[:, ['Class ID', 'Preferred Label']]
df

In [None]:
# Write the current df to CSV in the home directory, without the index.
df.to_csv('~/ICD9_LABEL', index=False)

In [None]:
# Display df_symp for inspection.
df_symp

In [None]:
# Display ICD9_df for inspection.
ICD9_df

In [None]:
# Write ICD9_df to '~/ICD9_CODES' without the index; the same path is also
# written by an identical call earlier in this notebook.
ICD9_df.to_csv('~/ICD9_CODES',index=False)

In [None]:
# Reload the table from the path it was written to ('~/ICD9_CODES') instead of
# one user's absolute home directory, so the cell works on any account.
# After the round trip through CSV the column labels are strings ('0', '1', ...).
ICD9_df = pd.read_csv('~/ICD9_CODES')

#ICD9_df[ICD9_df['1'] == '14.5']

In [None]:
# 40.7 is not included for some reason
# and needs to be added manually