In [1]:
import pandas as pd

In [2]:
# Load the labelled notes and discard every row that has a missing value
# in any column.
notes_labeled_path = '../caml-mimic/mimicdata/mimic3/notes_labeled.csv'
codes = pd.read_csv(notes_labeled_path).dropna()

In [3]:
# Split names, in the order in which they are concatenated below.
keys = ['train', 'dev', 'test']

# Column 0 of each header-less per-split id file, keyed by split name.
hadm_ids = {}
for split_name in keys:
    split_frame = pd.read_csv('../caml-mimic/mimicdata/mimic3/%s_50_hadm_ids.csv'%split_name, header=None)
    hadm_ids[split_name] = list(split_frame[0])

# Every id from every split: train first, then dev, then test.
hadm_ids_total = [hadm_id for split_name in keys for hadm_id in hadm_ids[split_name]]

# Column 0 of the header-less top-50 code file.
top_50_frame = pd.read_csv('../caml-mimic/mimicdata/mimic3/TOP_50_CODES.csv', header=None)
top_50_codes = list(top_50_frame[0])

In [4]:
from collections import defaultdict
import csv
def reformat(code, is_diag):
    """
    Put the decimal point of an ICD code string in its conventional place.

    Any dots already present are stripped first, so both dotted and
    undotted input is accepted.

    Parameters
    ----------
    code : str
        The code, with or without a dot.
    is_diag : bool
        True to format as a diagnosis code, False as a procedure code.

    Returns
    -------
    str
        * diagnosis, starting with 'E': dot after the 4th character, but
          only when the code is longer than 4 characters;
        * other diagnosis: dot after the 3rd character, but only when the
          code is longer than 3 characters;
        * procedure: dot after the 2nd character, always (even when nothing
          follows it).
    """
    undotted = code.replace('.', '')

    if not is_diag:
        # Procedure codes get the dot unconditionally.
        return undotted[:2] + '.' + undotted[2:]

    prefix_len = 4 if undotted.startswith('E') else 3
    if len(undotted) <= prefix_len:
        # Nothing after the prefix, so no dot is added.
        return undotted
    return undotted[:prefix_len] + '.' + undotted[prefix_len:]

def load_code_descriptions(version='mimic3', data_dir='../caml-mimic/mimicdata'):
    """
    Build a mapping from dotted ICD code to its text description.

    Three files under ``data_dir`` are read, in this order:

    1. ``D_ICD_DIAGNOSES.csv``  - CSV with a header row; column 1 is the
       code (formatted with ``reformat(code, True)``), the last column is
       the description. A later row for the same code replaces an earlier one.
    2. ``D_ICD_PROCEDURES.csv`` - same layout, formatted with
       ``reformat(code, False)``. An entry is added only when the
       *formatted* code is not already present.
    3. ``ICD9_descriptions``    - plain text, one ``code description ...``
       entry per line, split on whitespace. The code is used exactly as
       written and only fills in codes that are still missing. Blank lines
       are ignored.

    Parameters
    ----------
    version : str
        Not used by this function; kept so existing callers keep working.
    data_dir : str
        Directory that contains the three files. Defaults to the location
        that used to be hard-coded.

    Returns
    -------
    collections.defaultdict(str)
        code -> description. Being a ``defaultdict(str)``, indexing it with
        an unknown code returns ``''`` and inserts that key.
    """
    desc_dict = defaultdict(str)

    with open("%s/D_ICD_DIAGNOSES.csv" % (data_dir), 'r') as descfile:
        r = csv.reader(descfile)
        next(r, None)  # skip the header row (tolerate an empty file)
        for row in r:
            desc_dict[reformat(row[1], True)] = row[-1]

    with open("%s/D_ICD_PROCEDURES.csv" % (data_dir), 'r') as descfile:
        r = csv.reader(descfile)
        next(r, None)  # skip the header row (tolerate an empty file)
        for row in r:
            # The dict is keyed by *formatted* codes, so the membership test
            # must use the formatted code too. Testing the raw code wrongly
            # skipped a procedure whenever its undotted form equalled an
            # undotted diagnosis key (e.g. raw '042').
            code = reformat(row[1], False)
            if code not in desc_dict:
                desc_dict[code] = row[-1]

    with open('%s/ICD9_descriptions' % data_dir, 'r') as labelfile:
        for line in labelfile:
            fields = line.rstrip().split()
            if not fields:
                # Blank line: there is no code to read.
                continue
            code = fields[0]
            if code not in desc_dict:
                desc_dict[code] = ' '.join(fields[1:])
    return desc_dict

In [5]:
# Load every description, then keep only the entries whose code is one of
# the top-50 codes. The surviving entries keep their relative order.
desc_dict = {
    code: description
    for code, description in load_code_descriptions().items()
    if code in top_50_codes
}

In [6]:
# Keep only the notes whose HADM_ID appears in one of the split files, then
# renumber the rows from 0 and throw away the column holding the old index.
in_any_split = codes['HADM_ID'].isin(hadm_ids_total)
codes = codes.loc[in_any_split].reset_index().drop(columns=['index'])

# The number of surviving rows must equal the number of listed ids.
assert len(codes) == len(hadm_ids_total)

In [7]:
# Reduce each note's ';'-separated label string to the codes that are in the
# top-50 list. The survivors are sorted so the stored string does not depend
# on set iteration order, which can differ from one Python session to the next.
top_50_set = set(top_50_codes)
codes['LABELS'] = codes['LABELS'].apply(lambda x : ";".join(sorted(set(x.split(';')) & top_50_set)))
labels = list(codes['LABELS'])

# Every note must keep at least one label. The check looks at the string
# itself: ''.split(';') is [''] (length 1), so testing the length of the
# split result could never fail.
assert all(len(x.strip()) > 0 for x in labels)

def is_valid_code_set(x) :
    """Return True when every ';'-separated item of `x` is one of `top_50_codes`.

    An empty string is not valid: it splits into [''], and '' is not a code.
    """
    return set(x.split(';')).issubset(top_50_codes)

assert all(is_valid_code_set(x) for x in labels)

In [8]:
import spacy, re
# spaCy pipeline with the parser, tagger and NER components disabled;
# `cleaner` below only reads each token's text.
# NOTE(review): the 'en' shortcut name is a spaCy 2.x convention - confirm
# that the installed spaCy version still accepts it.
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def cleaner(text, spacy=True) :
    """
    Normalise a piece of free text into a single space-separated string.

    Steps, in order:
      1. trim the text and collapse every whitespace run to one space;
      2. tokenise - with the module-level `nlp` pipeline when `spacy` is
         true, otherwise by plain whitespace splitting - and lower-case
         every token;
      3. replace each bracketed ``[** ... **]`` span with the token ``<DE>``
         (presumably the corpus' de-identification placeholders);
      4. collapse a repeated non-alphanumeric character (optionally
         separated by whitespace) to one occurrence followed by a space;
      5. collapse whitespace again;
      6. replace every word that contains a digit with ``qqq``.

    Parameters
    ----------
    text : str
        Raw text.
    spacy : bool
        Selects the tokeniser (see step 2). Note that inside this function
        the name refers to this flag, not to the spaCy module.

    Returns
    -------
    str
        The cleaned text.
    """
    squeezed = re.sub(r'\s+', ' ', text.strip())

    if spacy :
        tokens = [token.text.lower() for token in nlp(squeezed)]
    else :
        tokens = [piece.lower() for piece in squeezed.split()]

    joined = " ".join(tokens)
    joined = re.sub(r'\[\s*\*\s*\*(.*?)\*\s*\*\s*\]', ' <DE> ', joined)
    joined = re.sub(r'([^a-zA-Z0-9])(\s*\1\s*)+', r'\1 ', joined)
    joined = re.sub(r'\s+', ' ', joined.strip())

    masked = []
    for word in joined.split(' ') :
        contains_digit = any(char.isdigit() for char in word)
        masked.append('qqq' if contains_digit else word)
    return " ".join(masked)

In [9]:
# Plain Python list of the note texts, in row order, so the entries can be
# replaced one by one during cleaning.
texts = codes['TEXT'].tolist()

In [10]:
from tqdm import tqdm_notebook

In [11]:
# Clean every note in place. `miniters=100` limits how often the progress
# bar is redrawn.
for i, raw_text in enumerate(tqdm_notebook(texts, miniters=100)) :
    texts[i] = cleaner(raw_text)

HBox(children=(IntProgress(value=0, max=11368), HTML(value='')))




In [12]:
# Run the code descriptions through the same cleaner as the notes.
desc_dict = {code: cleaner(description) for code, description in desc_dict.items()}

In [13]:
# Put the cleaned texts back on the frame and write it to disk without the
# row index. The path is relative - the same 'MIMIC/' directory that
# receives 'MIMIC/vec_icd9.p' - instead of an absolute path that only exists
# on one user's machine.
codes['TEXT'] = texts
codes.to_csv('MIMIC/cleaned_data.csv', index=False)

In [14]:
import vectorizer as v
# Bind the class to a notebook-level name so later cells can use it without
# the module prefix.
Vectorizer = v.Vectorizer

In [15]:
# NOTE(review): min_df is presumably the minimum number of documents a token
# must appear in to enter the vocabulary - confirm in vectorizer.py.
vec = Vectorizer(min_df=5)

In [16]:
# Fit on the cleaned notes followed by the cleaned code descriptions, so the
# vectorizer sees the text of both.
vec.fit(list(codes['TEXT']) + list(desc_dict.values()))

In [17]:
# Number the labels in desc_dict's iteration order and keep the inverse map.
vec.label2idx = {}
vec.idx2label = {}
for position, code in enumerate(desc_dict) :
    vec.label2idx[code] = position
    vec.idx2label[position] = code

vec.label2desc = desc_dict

# Each description is encoded as a batch of one, and the single result is
# taken back out of the returned batch.
vec.code_desc = {}
for code, description in desc_dict.items() :
    vec.code_desc[code] = vec.convert_to_sequence([description])[0]

In [18]:
# Per split: the encoded note texts and, for every note, the list of label
# indices. Rows are taken in the order they have in `codes`.
vec.seqs = {}
vec.labels = {}
for split_name in keys :
    split_rows = codes[codes['HADM_ID'].isin(hadm_ids[split_name])]
    vec.seqs[split_name] = vec.texts_to_sequences(list(split_rows['TEXT']))

    # 'a;b;c' -> ['a', 'b', 'c'] -> [index of a, index of b, index of c]
    labels = [joined.split(';') for joined in split_rows['LABELS']]
    vec.labels[split_name] = [
        [vec.label2idx[code] for code in note_codes]
        for note_codes in labels
    ]

In [19]:
import numpy as np
# One row per note, one column per label index; a cell is 1 when the note
# carries that label.
vec.label_one_hot = {}
n_label_columns = max(vec.idx2label.keys())+1
for k in keys :
    one_hot = np.zeros((len(vec.seqs[k]), n_label_columns))
    for row, label_ids in enumerate(vec.labels[k]) :
        # label_ids is a list, so this sets all of the note's columns at once.
        one_hot[row, label_ids] = 1
    vec.label_one_hot[k] = one_hot

# Check: the columns set to 1 in each row are exactly that note's label indices.
for k in keys :
    for row, label_ids in enumerate(vec.labels[k]) :
        hot_columns = np.where(vec.label_one_hot[k][row] == 1)[0]
        assert (set(hot_columns) == set(label_ids))

In [20]:
from gensim.models import Word2Vec

In [21]:
# Turn every encoded note (all splits, in vec.seqs order) and then every
# encoded code description back into a list of words.
sentences = []
for split_seqs in vec.seqs.values() :
    for seq in split_seqs :
        sentences.append([vec.idx2word[idx] for idx in seq])
for seq in vec.code_desc.values() :
    sentences.append([vec.idx2word[idx] for idx in seq])

In [22]:
# 200-dimensional vectors, context window of 10, every word kept (min_count=1).
# NOTE(review): gensim trains inside the constructor when a corpus is passed,
# so the later model.train(...) call adds further epochs on top - confirm
# that this is intended.
# NOTE(review): no seed is given and workers=5, so the vectors are presumably
# not reproducible from run to run; `size` is the pre-4.0 gensim keyword.
model = Word2Vec(sentences, size=200, window=10, min_count=1, workers=5)

In [23]:
# Ten more passes over the same sentences. Left as the cell's last
# expression so the value returned by train() is displayed.
model.train(sentences, total_examples=len(sentences), epochs=10)

(160234038, 238430380)

In [24]:
# Presumably copies the trained word vectors into the vectorizer (see
# vectorizer.py). Left as the cell's last expression so the returned array
# is displayed.
vec.extract_embeddings(model)

Found 22315 words in model out of 22316


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.33724129, -0.72580487,  0.25685626, ...,  0.84722298,
        -0.42704654,  0.16344222],
       [ 1.36212325, -3.3724916 ,  1.49810886, ..., -4.18783808,
         3.35530281, -0.17716061],
       ...,
       [ 0.39726099,  0.06424714,  0.01182108, ...,  0.05343216,
         0.01368389, -0.1997826 ],
       [ 3.05400419, -0.07490952,  4.42545509, ...,  3.40820122,
        -1.88489342, -1.60525978],
       [ 0.53760332, -1.25615954,  0.02455853, ...,  0.53178424,
        -1.84809422, -0.37117359]])

In [25]:
import pickle
# Persist the fitted vectorizer. The `with` block closes the file as soon as
# the dump finishes (also when it raises), so the pickle is fully flushed to
# disk; a bare open(...) left that to garbage collection.
with open('MIMIC/vec_icd9.p', 'wb') as vec_file :
    pickle.dump(vec, vec_file)