In [1]:
import pandas as pd

In [2]:
# Load the cleaned MIMIC notes table and discard every row that has a missing value.
codes = (
    pd.read_csv('MIMIC/cleaned_data_full.csv')
    .dropna()
)

In [3]:
# Admission ids (HADM_ID) for each split, read from the first column of the
# header-less per-split CSV files.
keys = ['train', 'dev', 'test']
hadm_ids = {
    split: list(
        pd.read_csv(
            '../caml-mimic/mimicdata/mimic3/%s_50_hadm_ids.csv' % split,
            header=None,
        )[0]
    )
    for split in keys
}
# All ids, concatenated in train -> dev -> test order.
hadm_ids_total = [hadm_id for split in ('train', 'dev', 'test') for hadm_id in hadm_ids[split]]

In [4]:
# First column of the header-less CSV holds the top-50 ICD-9 codes.
# dtype=str stops pandas from inferring a numeric column, which would turn
# codes such as "038.9" or "250.00" into 38.9 / 250.0 and break the string
# comparisons against description keys and label strings further down.
top_50_codes = list(
    pd.read_csv(
        '../caml-mimic/mimicdata/mimic3/TOP_50_CODES.csv',
        header=None,
        dtype=str,
    )[0]
)

In [5]:
from collections import defaultdict
import csv
def reformat(code, is_diag):
    """Insert the decimal point of an ICD-9 code at its conventional position.

    Any dots already present in `code` are removed first.

    * Procedure codes (`is_diag` falsy): the dot always goes after the first
      two characters, even when nothing follows it.
    * Diagnosis codes (`is_diag` truthy): the dot goes after the first four
      characters for codes starting with 'E', otherwise after the first
      three, and only when there are characters left to put after it.
    """
    bare = code.replace('.', '')
    if not is_diag:
        return bare[:2] + '.' + bare[2:]

    head_len = 4 if bare.startswith('E') else 3
    if len(bare) <= head_len:
        # Nothing after the head, so no dot is added.
        return bare
    return bare[:head_len] + '.' + bare[head_len:]

def load_code_descriptions(version='mimic3', data_dir='../caml-mimic/mimicdata'):
    """Build a mapping from dotted ICD-9 code to its text description.

    Three files under `data_dir` are read in order; a code that already has a
    description is never overwritten by a later file:

    1. ``D_ICD_DIAGNOSES.csv``  - keys are ``reformat(code, True)``
    2. ``D_ICD_PROCEDURES.csv`` - keys are ``reformat(code, False)``
    3. ``ICD9_descriptions``    - whitespace-separated lines, first field is
       the (already dotted) code and the remaining fields are the description

    For the two CSV files the first row is skipped as a header, the code is
    taken from the second column and the description from the last column.

    Args:
        version: Not used by this function; kept so existing calls still work.
        data_dir: Directory that contains the three files.

    Returns:
        A ``defaultdict(str)`` mapping code -> description.
    """
    desc_dict = defaultdict(str)

    with open("%s/D_ICD_DIAGNOSES.csv" % data_dir, 'r') as descfile:
        reader = csv.reader(descfile)
        next(reader)  # header
        for row in reader:
            if len(row) < 2:
                # csv.reader yields [] for blank lines; nothing to record.
                continue
            desc_dict[reformat(row[1], True)] = row[-1]

    with open("%s/D_ICD_PROCEDURES.csv" % data_dir, 'r') as descfile:
        reader = csv.reader(descfile)
        next(reader)  # header
        for row in reader:
            if len(row) < 2:
                continue
            # Compare the *dotted* key that is actually stored. Testing the raw
            # code would wrongly drop a procedure whose undotted form happens
            # to equal an undotted 3-character diagnosis key.
            key = reformat(row[1], False)
            if key not in desc_dict:
                desc_dict[key] = row[-1]

    with open('%s/ICD9_descriptions' % data_dir, 'r') as labelfile:
        for line in labelfile:
            fields = line.split()
            if not fields:
                # Blank line: there is no code to index.
                continue
            code = fields[0]
            if code not in desc_dict:
                desc_dict[code] = ' '.join(fields[1:])
    return desc_dict

In [6]:
# Keep only the descriptions whose code is one of the top-50 codes
# (the result is a plain dict, in the order the loader produced the codes).
desc_dict = {
    code: description
    for code, description in load_code_descriptions().items()
    if code in top_50_codes
}

In [7]:
# Keep only admissions that belong to one of the three splits, then renumber
# the rows from 0 (reset_index adds an 'index' column, which is dropped again).
codes = (
    codes.loc[codes['HADM_ID'].isin(hadm_ids_total)]
    .reset_index()
    .drop('index', axis=1)
)
# One surviving row per listed admission id.
assert len(hadm_ids_total) == len(codes)

In [8]:
# Restrict each admission's ';'-separated label string to the top-50 codes.
# The code set is built once instead of once per row, and the surviving codes
# are sorted so the joined string does not depend on set iteration order.
top_50_code_set = set(top_50_codes)
codes['LABELS'] = codes['LABELS'].apply(
    lambda x : ";".join(sorted(set(x.split(';')) & top_50_code_set))
)
labels = list(codes['LABELS'])

# Every admission must keep at least one label. Checking the length of
# x.split(';') cannot detect an empty string (''.split(';') == ['']), so test
# for a non-blank label string directly.
assert all([x.strip() != '' for x in labels])

def is_valid_code_set(x) :
    """Return True when every ';'-separated entry of `x` is a top-50 code."""
    l = x.split(';')
    return len(set(l) - set(top_50_codes)) == 0

assert all([is_valid_code_set(x) for x in labels])

In [9]:
import spacy, re
# spaCy pipeline with the parser, tagger and NER components disabled; `cleaner`
# only reads token text from it, i.e. it is used as a tokenizer.
# NOTE(review): the 'en' shortcut name is only accepted by older spaCy
# releases — confirm the installed version before re-running.
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def cleaner(text, spacy=True) :
    """Normalise a piece of text into a lower-cased, space-separated string.

    Steps, in order:
      1. Trim the text and collapse every whitespace run to one space.
      2. Tokenise with the module-level `nlp` pipeline when `spacy` is truthy,
         otherwise split on whitespace; lower-case each token.
      3. Replace bracketed ``[** ... **]`` spans with ``<DE>``.
      4. Collapse a non-alphanumeric character that is immediately repeated
         (optionally separated by whitespace) into one occurrence.
      5. Replace every token that contains a digit with ``qqq``.

    Note that the `spacy` flag shadows the spaCy module inside this function.
    """
    squeezed = re.sub(r'\s+', ' ', text.strip())

    if spacy :
        tokens = [tok.text.lower() for tok in nlp(squeezed)]
    else :
        tokens = [piece.lower() for piece in squeezed.split()]

    joined = " ".join(tokens)
    joined = re.sub(r'\[\s*\*\s*\*(.*?)\*\s*\*\s*\]', ' <DE> ', joined)
    joined = re.sub(r'([^a-zA-Z0-9])(\s*\1\s*)+', r'\1 ', joined)
    joined = re.sub(r'\s+', ' ', joined.strip())

    masked = []
    for word in joined.split(' ') :
        contains_digit = any(ch.isdigit() for ch in word)
        masked.append('qqq' if contains_digit else word)
    return " ".join(masked)

In [11]:
from tqdm import tqdm_notebook

In [13]:
# Run every code description through `cleaner`, updating the same dict in place.
desc_dict.update({code: cleaner(text) for code, text in desc_dict.items()})

In [15]:
import vectorizer as v
Vectorizer = v.Vectorizer

In [16]:
# Vectorizer comes from the project-local `vectorizer` module (not shown here).
# NOTE(review): min_df=5 is presumably a minimum-frequency cutoff for keeping a
# word in the vocabulary — confirm against vectorizer.py.
vec = Vectorizer(min_df=5)

In [17]:
# Fit on every note in `codes` (train, dev and test rows alike) followed by the
# code descriptions.
# NOTE(review): this means dev/test text contributes to whatever `fit` learns
# (presumably the vocabulary) — confirm that is intended.
vec.fit(list(codes['TEXT']) + list(desc_dict.values()))

In [18]:
# Label bookkeeping attached to the vectorizer. Codes are numbered 0..N-1 in
# the iteration order of `desc_dict`.
vec.label2idx = dict(zip(desc_dict, range(len(desc_dict))))
vec.idx2label = dict(enumerate(desc_dict))
vec.label2desc = desc_dict
# Each description is converted on its own (a one-element batch, first result kept).
vec.code_desc = {
    code: vec.convert_to_sequence([description])[0]
    for code, description in desc_dict.items()
}

In [19]:
# Per-split token sequences and label-index lists, keyed by split name.
vec.seqs, vec.labels = {}, {}
for k in keys :
    # Rows of `codes` whose admission id belongs to this split.
    code_filtered = codes[codes['HADM_ID'].isin(hadm_ids[k])]
    vec.seqs[k] = vec.texts_to_sequences(list(code_filtered['TEXT']))
    # ';'-separated label string -> list of codes -> list of label indices.
    labels = [label_string.split(';') for label_string in list(code_filtered['LABELS'])]
    vec.labels[k] = [
        [vec.label2idx[code] for code in code_list]
        for code_list in labels
    ]

In [20]:
import numpy as np
# Multi-hot label matrix per split: one row per sequence, one column per label index.
vec.label_one_hot = {}
for k in keys :
    one_hot = np.zeros((len(vec.seqs[k]), max(vec.idx2label.keys()) + 1))
    for row_idx, label_ids in enumerate(vec.labels[k]) :
        one_hot[row_idx, label_ids] = 1
    vec.label_one_hot[k] = one_hot

# Sanity check: the columns set to 1 in each row are exactly that row's label indices.
for k in keys :
    for row_idx, label_ids in enumerate(vec.labels[k]) :
        active_columns = np.where(vec.label_one_hot[k][row_idx] == 1)[0]
        assert set(active_columns) == set(label_ids)

In [21]:
from gensim.models import KeyedVectors
# Load word vectors previously saved with gensim's KeyedVectors format.
# NOTE(review): gensim's load relies on pickle — only load files from a trusted source.
model = KeyedVectors.load('MIMIC/mimic_embedding_model.wv')

In [22]:
# Look up embeddings in `model`; the call prints how many words were found in
# the model and, being the last expression of the cell, its returned array is
# echoed as the cell output.
vec.extract_embeddings(model)

Found 22312 words in model out of 22316


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 6.07958188e-01,  7.56901864e-01, -4.58418838e-01, ...,
         3.27159178e-01,  1.05838462e+00, -1.00027972e+00],
       [ 3.20409388e-01, -9.84535349e-01, -1.66852078e+00, ...,
        -1.20682319e-01,  2.20697478e+00, -1.14211235e+00],
       ...,
       [ 1.16788733e+00, -9.24137831e-01,  1.75671399e+00, ...,
         9.75742284e-03, -2.87074357e-01, -2.45802402e-01],
       [ 9.31499293e-04, -9.84110713e-01, -1.11939907e+00, ...,
        -6.90321016e+00,  3.93742228e+00, -1.51261103e+00],
       [-7.20303178e-01, -1.40906677e-01, -2.27291822e+00, ...,
        -1.95523810e+00,  7.60727704e-01, -1.39611304e+00]])

In [23]:
import pickle
# Persist the fully populated vectorizer. The `with` block guarantees the file
# is flushed and closed; passing a bare open(...) to pickle.dump leaves the
# handle open until it happens to be garbage-collected.
with open('MIMIC/vec_icd9.p', 'wb') as vec_file:
    pickle.dump(vec, vec_file)