In [23]:
import os
import csv
import sys
import argparse
import json

import pandas as pd
import numpy as np

import torch
from transformers import DataCollatorForLanguageModeling, BertForMaskedLM
from transformers import Trainer, TrainingArguments


# Convert diagnosis codes (ICD-9 to ICD-10)

In [3]:
# Pipe-delimited ICD-9 -> ICD-10 diagnosis crosswalk file.
diagmap_path = os.path.join('/home/liutianc/emr/data/', 'diag_9to10.csv')

# Read every row eagerly inside a `with` block so the file handle is closed
# deterministically. Previously the handle passed to csv.reader was never
# closed. `icd_csv` is therefore a list of rows (each a list of fields)
# rather than a lazy reader; it can still be iterated the same way.
# newline='' is the mode the csv module documents for file objects it reads.
with open(diagmap_path, newline='') as diagmap_file:
    icd_csv = list(csv.reader(diagmap_file, delimiter='|'))

In [4]:
# Parse the crosswalk by hand. The header row is recognised by its second
# field being 'Flags'; every other row is kept as a list of its fields.
icd_map = []
with open(diagmap_path) as diagmap_file:  # closed on exit, unlike a bare open()
    for code in diagmap_file:
        code = code.strip()
        fields = code.split("|")
        # A blank or single-field line has no second column; indexing it
        # used to raise IndexError. Skip such lines instead.
        if len(fields) < 2:
            continue
        if fields[1] == 'Flags':
            header = fields
            print(code)
        else:
            icd_map.append(fields)

# Only the first two fields of each row are used: the ICD-9 code and its
# ICD-10 counterpart.
icd_map_df = pd.DataFrame([row[:2] for row in icd_map], columns=['icd9', 'icd10'])

# Drop some outlier characters.
for column in ('icd9', 'icd10'):
    icd_map_df[column] = icd_map_df[column].str.replace("'", "", regex=False)

# Only keep rows with defined mapping relationships: coverage rate is 14,660 / 15,086 = 95.85%
icd_map_df = icd_map_df.loc[icd_map_df['icd10'] != '']

# Only keep rows with UNIQUE mapping relationships: coverage rate is 13,516 / 14,660 = 92.20%
icd9_mapnum = icd_map_df.groupby('icd9').count().reset_index()
icd9_unimap = set(icd9_mapnum.loc[icd9_mapnum['icd10'] == 1, 'icd9'])
# .isin replaces the per-element lambda; .copy() makes the result an
# independent frame so the column assignment below does not write into a
# slice of the previous frame (SettingWithCopyWarning).
icd_map_df = icd_map_df.loc[icd_map_df['icd9'].isin(icd9_unimap)].copy()

# Cast icd10 encode to our data version: coverage rate is 9261 / 9802 = 94.48%
icd_map_df['icd10'] = icd_map_df['icd10'].str.replace('.', '', regex=False).str.upper()

icd_map_df.shape

(13516, 2)

In [5]:
# Index the crosswalk by ICD-9 code. The guard keeps this cell re-runnable:
# once 'icd9' has been moved into the index, a second set_index('icd9')
# would raise KeyError because the column no longer exists.
if icd_map_df.index.name != 'icd9':
    icd_map_df = icd_map_df.set_index('icd9')

# orient='index' yields {icd9: {'icd10': <code>}}, one inner dict per row.
icd_map_raw = icd_map_df.to_dict(orient='index')

In [7]:
# Build the lookup {icd9 spelling variant: [icd10, original icd9]}.
# A variant produced by more than one ICD-9 code is ambiguous and is stored
# as None (and stays None however many further codes collide with it).
icd_map = {}
for icd9, row in icd_map_raw.items():
    value = row['icd10']

    _icd9 = icd9.replace('.', '')
    # The dot-less code itself, plus the code with a single 0 appended
    # (zeros are only ever added at the end).
    keys = [_icd9, _icd9 + '0']
    # If the last two chars are not 00, also add the code with 00 appended.
    if _icd9[-2:] != '00':
        keys.append(_icd9 + '00')
    # Strip ALL 0 (str.strip removes them from both ends).
    keys.append(_icd9.strip('0'))

    # dict.fromkeys de-duplicates while keeping list order. Iterating a set
    # here made insertion order, and so the key order of the dumped JSON,
    # vary between interpreter runs. The resulting contents are unchanged.
    for key in dict.fromkeys(keys):
        if key in icd_map:
            icd_map[key] = None
        else:
            icd_map[key] = [value, icd9]


In [12]:
DATA_PATH = '/nfs/turbo/lsa-regier/emr-data'

vocab_path = os.path.join(DATA_PATH, 'vocabs', 'vocab_merged.json')

with open(vocab_path, 'r') as file:
    vocabs = json.load(file)

# Bare codes pulled out of the vocabulary tokens, bucketed by kind
# (diagnosis / procedure) and by ICD version.
diag_icd9, diag_icd10 = [], []
proc_icd9, proc_icd10 = [], []

for token in vocabs:
    # Pick the pair of destination lists from the token kind. 'diag' is
    # checked first; a token matching neither kind is ignored.
    if 'diag' in token:
        by_version = (('icd:9_', diag_icd9), ('icd:10_', diag_icd10))
    elif 'proc' in token:
        by_version = (('icd:9_', proc_icd9), ('icd:10_', proc_icd10))
    else:
        continue

    # First matching version marker wins. The code is the text after ':' in
    # the second '_'-separated piece of the token.
    for marker, bucket in by_version:
        if marker in token:
            bucket.append(token.split('_')[1].split(':')[1])
            break

# Only the diagnosis codes have their dashes removed.
diag_icd9 = [code.replace('-', '') for code in diag_icd9]
diag_icd10 = [code.replace('-', '') for code in diag_icd10]

In [35]:
# Distinct diagnosis codes found in the vocabulary, per ICD version.
icd9 = set(diag_icd9)
icd10 = set(diag_icd10)

# Keys with a usable mapping (vc) and keys marked ambiguous with None (ivc).
# Built as sets because each is probed once or more per token below; with
# lists every probe was a linear scan.
vc = {k for k, v in icd_map.items() if v is not None}
ivc = {k for k, v in icd_map.items() if v is None}

fail, success = set(), set()
for t_raw in icd9:
    # Since we didn't add 0 to the beginning when building icd_map, removing
    # possible extra leading 0 is done here, one character at a time, until a
    # variant is found in vc or no leading zero is left.
    t = t_raw
    is_succ = t in vc
    # t[:1] (not t[0]) so an empty or all-zero token ends the loop instead of
    # raising IndexError once the string has been consumed.
    while not is_succ and t[:1] == '0':
        t = t[1:]
        is_succ = t in vc

    if is_succ:
        success.add(t_raw)
    else:
        fail.add(t_raw)

print(f'''
Total icd9 diag we have: {len(icd9)},
Caught by valid mapping: {len(success)},
Caught by invalid mapping: {len(fail.intersection(ivc))},
Not caught: {len(fail.difference(set(icd_map)))}
Actually ICD10: {len(fail.intersection(icd10))}
''')


Total icd9 diag we have: 26963,
Caught by valid mapping: 15234,
Caught by invalid mapping: 962,
Not caught: 10767
Actually ICD10: 3778



In [44]:
# NOTE(review): the output recorded for this cell is `True`, which a bare
# string literal would not display. The original expression may have been
# truncated in export (possibly a membership test) -- confirm before relying
# on this cell.
'icd:9_diag:26080'

True

In [21]:
# Identity table for the ICD-10 diagnosis codes: every code maps to a
# one-element list holding itself.
icd_10 = {code: [code] for code in icd10}

In [22]:
# Persist both lookup tables as JSON. The targets are built from DATA_PATH
# (the same root the vocabulary is read from) instead of repeating the
# absolute directory; the resulting paths are the same. `json` comes from the
# import cell at the top of the notebook, so it is not re-imported here.
icd_10_file = os.path.join(DATA_PATH, 'icd_10.json')
icd_map_file = os.path.join(DATA_PATH, 'icd_map.json')

with open(icd_10_file, 'w') as file:
    json.dump(icd_10, file)

# Ambiguous keys in icd_map hold None and are serialised as JSON null.
with open(icd_map_file, 'w') as file:
    json.dump(icd_map, file)