In [1]:
import pandas as pd

In [2]:
# Read the CSV, then discard every row that contains a missing value.
codes = pd.read_csv('MIMIC/cleaned_data_full.csv')
codes = codes.dropna()

In [3]:
from collections import defaultdict
import csv
def reformat(code, is_diag):
    """Place the decimal point of an ICD-9 code at its conventional position.

    Any dots already present in ``code`` are removed first.

    * Procedure codes (``is_diag`` falsy): a dot is always inserted after the
      second character, even if nothing follows it.
    * Diagnosis codes (``is_diag`` truthy): the dot goes after the fourth
      character when the code starts with ``'E'`` and after the third
      otherwise, and is only inserted when characters remain after it.

    Returns the re-dotted code as a string.
    """
    bare = code.replace('.', '')

    if not is_diag:
        return bare[:2] + '.' + bare[2:]

    # E-codes carry one extra leading character before the decimal point.
    split_at = 4 if bare.startswith('E') else 3
    if len(bare) <= split_at:
        return bare
    return bare[:split_at] + '.' + bare[split_at:]

def load_code_descriptions(version='mimic3', data_dir='../caml-mimic/mimicdata'):
    """Build a lookup from dotted ICD-9 code to its textual description.

    Three files under ``data_dir`` are read, in this order:

    1. ``D_ICD_DIAGNOSES.csv``  -- CSV with a header row; the code is taken
       from the second column and the description from the last column.
       Codes are keyed as ``reformat(code, True)``.
    2. ``D_ICD_PROCEDURES.csv`` -- same layout, keyed as
       ``reformat(code, False)``. An entry is only added when that key does
       not already have a description.
    3. ``ICD9_descriptions``    -- plain text, one ``code description ...``
       record per line split on whitespace. The code is used as-is and only
       added when it is not already present.

    Args:
        version: Accepted for interface compatibility; not used by this
            function.
        data_dir: Directory containing the three files above.

    Returns:
        ``defaultdict(str)`` mapping code -> description, so looking up an
        unknown code yields ``''`` instead of raising.

    Raises:
        OSError: if any of the three files cannot be opened.
    """
    desc_dict = defaultdict(str)

    with open("%s/D_ICD_DIAGNOSES.csv" % data_dir, 'r') as descfile:
        r = csv.reader(descfile)
        next(r, None)  # skip the header row (tolerate an empty file)
        for row in r:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            desc_dict[reformat(row[1], True)] = row[-1]

    with open("%s/D_ICD_PROCEDURES.csv" % data_dir, 'r') as descfile:
        r = csv.reader(descfile)
        next(r, None)  # skip the header row (tolerate an empty file)
        for row in r:
            if not row:
                continue
            # Test the key that is actually stored. Checking the raw,
            # undotted code would compare against differently formatted keys
            # and could skip a procedure whose raw code happens to equal a
            # dot-less diagnosis key.
            key = reformat(row[1], False)
            if key not in desc_dict:
                desc_dict[key] = row[-1]

    with open('%s/ICD9_descriptions' % data_dir, 'r') as labelfile:
        for line in labelfile:
            fields = line.rstrip().split()
            if not fields:
                # blank line: nothing to index
                continue
            code = fields[0]
            if code not in desc_dict:
                desc_dict[code] = ' '.join(fields[1:])
    return desc_dict

In [4]:
# Build the ICD-9 code -> description lookup (a defaultdict(str)) from the
# description files read by load_code_descriptions.
desc_dict = load_code_descriptions()

In [5]:
import spacy, re
# Load spaCy's 'en' pipeline with the parser, tagger and NER components
# disabled; `cleaner` only reads the token text from the resulting docs.
# NOTE(review): the 'en' shortcut name depends on the installed spaCy version — confirm.
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def cleaner(text, spacy=True) :
    """Normalise a piece of text into a lower-cased, space-separated string.

    Steps, in order:

    1. Strip the text and collapse every whitespace run to a single space.
    2. Tokenise -- with the module-level ``nlp`` pipeline when ``spacy`` is
       truthy, otherwise by plain whitespace splitting -- and lower-case
       every token.
    3. Replace bracketed ``[** ... **]`` spans with the placeholder ``<DE>``.
    4. Collapse a non-alphanumeric character that is repeated (optionally
       with whitespace in between) into one occurrence followed by a space.
    5. Collapse whitespace again, then replace every word containing at
       least one digit with ``qqq``.

    Returns the cleaned text as a single string.
    """
    collapsed = re.sub(r'\s+', ' ', text.strip())

    raw_tokens = (tok.text for tok in nlp(collapsed)) if spacy else collapsed.split()
    lowered = " ".join(tok.lower() for tok in raw_tokens)

    # The pattern tolerates spaces between the brackets and asterisks, which
    # tokenisation may have introduced.
    masked = re.sub(r'\[\s*\*\s*\*(.*?)\*\s*\*\s*\]', ' <DE> ', lowered)
    deduped = re.sub(r'([^a-zA-Z0-9])(\s*\1\s*)+', r'\1 ', masked)
    tidy = re.sub(r'\s+', ' ', deduped.strip())

    words = []
    for word in tidy.split(' '):
        has_digit = any(char.isdigit() for char in word)
        words.append('qqq' if has_digit else word)
    return " ".join(words)

In [6]:
# Pull the TEXT column out as a plain Python list so entries can be replaced by index.
texts = codes['TEXT'].tolist()

In [7]:
from tqdm import tqdm_notebook

In [8]:
# Clean every note in place, showing a progress bar over the whole list.
for i, raw_text in enumerate(tqdm_notebook(texts, miniters=100)) :
    texts[i] = cleaner(raw_text)

In [9]:
# Run the same cleaning over every code description; keys are left untouched.
for k, raw_desc in list(desc_dict.items()) :
    desc_dict[k] = cleaner(raw_desc)

In [10]:
# Put the cleaned notes back on the frame and persist them without the index.
# The path is relative, like the other MIMIC/ paths in this notebook, so the
# cell no longer depends on one user's absolute directory layout.
# WARNING: this is the same relative path the notebook loads `codes` from, so
# re-running from the top will clean text that has already been cleaned.
codes['TEXT'] = texts
codes.to_csv('MIMIC/cleaned_data_full.csv', index=False)

In [11]:
from gensim.models import Word2Vec

In [12]:
# Training corpus: one token list per cleaned note, followed by one token
# list per cleaned code description.
sentences = [note.split(' ') for note in texts]
sentences.extend(desc.split(' ') for desc in desc_dict.values())

In [13]:
# Build a Word2Vec model over the tokenised corpus.
# NOTE(review): `size` is the older gensim keyword for the vector dimensionality
# (renamed in later releases) — confirm against the installed gensim version.
# NOTE(review): no seed is passed and workers=5, so the resulting vectors are
# presumably not reproducible from run to run — confirm whether that matters.
model = Word2Vec(sentences, size=200, window=10, min_count=2, workers=5)

In [14]:
# Train for 10 epochs over the same corpus.
# NOTE(review): gensim's Word2Vec constructor already trains when it is given
# `sentences`, so this looks like additional training on top of the previous
# cell — confirm that is intended.
model.train(sentences, total_examples=len(sentences), epochs=10)

(695670798, 1035871840)

In [15]:
# Persist only the word vectors (model.wv) rather than the whole model object.
model.wv.save("MIMIC/mimic_embedding_model.wv")