In [1]:
import pandas as pd

In [2]:
codes = pd.read_csv('../caml-mimic/mimicdata/mimic3/notes_labeled.csv').dropna()

In [3]:
from collections import defaultdict
import csv
def reformat(code, is_diag):
    code = ''.join(code.split('.'))
    if is_diag:
        if code.startswith('E'):
            if len(code) > 4:
                code = code[:4] + '.' + code[4:]
        else:
            if len(code) > 3:
                code = code[:3] + '.' + code[3:]
    else:
        code = code[:2] + '.' + code[2:]
    return code

def load_code_descriptions(version='mimic3'):
    desc_dict = defaultdict(str)
    DATA_DIR = '../caml-mimic/mimicdata'
    with open("%s/D_ICD_DIAGNOSES.csv" % (DATA_DIR), 'r') as descfile:
        r = csv.reader(descfile)
        next(r)
        for row in r:
            code = row[1]
            desc = row[-1]
            desc_dict[reformat(code, True)] = desc
            
    with open("%s/D_ICD_PROCEDURES.csv" % (DATA_DIR), 'r') as descfile:
        r = csv.reader(descfile)
        next(r)
        for row in r:
            code = row[1]
            desc = row[-1]
            if code not in desc_dict.keys():
                desc_dict[reformat(code, False)] = desc
                
    with open('%s/ICD9_descriptions' % DATA_DIR, 'r') as labelfile:
        for i,row in enumerate(labelfile):
            row = row.rstrip().split()
            code = row[0]
            if code not in desc_dict.keys():
                desc_dict[code] = ' '.join(row[1:])
    return desc_dict

In [4]:
desc_dict = load_code_descriptions()

In [5]:
for k in desc_dict :
    if '285' in k : print(k, desc_dict[k])

285.0 Sideroblastic anemia
285.1 Acute posthemorrhagic anemia
285.21 Anemia in chronic kidney disease
285.22 Anemia in neoplastic disease
285.29 Anemia of other chronic disease
285.3 Antineoplastic chemotherapy induced anemia
285.8 Other specified anemias
285.9 Anemia, unspecified
285 Other and unspecified anemias
285.2 Anemia of chronic disease


In [7]:
def has_label(x, positive_icd9) :
    return any([positive_icd9 in y for y in x.split(';')])

In [8]:
codes['has_c1'] = codes['LABELS'].apply(lambda x : has_label(x, '285.1'))
codes['has_c2'] = codes['LABELS'].apply(lambda x : has_label(x, '285.2'))

In [9]:
codes.groupby(['has_c1', 'has_c2']).size()

has_c1  has_c2
False   False     45661
        True       2562
True    False      4283
        True        216
dtype: int64

In [8]:
data_for_pos_label = codes[codes['has_pos'] == True]
data_for_neg_label = codes[codes['has_pos'] == False].sample(len(data_for_pos_label), random_state=14829)

In [9]:
data = pd.concat([data_for_neg_label, data_for_pos_label]).reset_index(drop=True)

In [10]:
import spacy, re
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def cleaner(text, spacy=True) :
    text = re.sub(r'\s+', ' ', text.strip())
    if spacy :
        text = [t.text.lower() for t in nlp(text)]
    else :
        text = [t.lower() for t in text.split()]
    text = " ".join(text)
    text = re.sub(r'\[\s*\*\s*\*(.*?)\*\s*\*\s*\]', ' <DE> ', text)
    text = re.sub(r'([^a-zA-Z0-9])(\s*\1\s*)+', r'\1 ', text)
    text = re.sub(r'\s+', ' ', text.strip())
    text = ['qqq' if any(char.isdigit() for char in word) else word for word in text.split(' ')]
    return " ".join(text)

In [11]:
texts = list(data['TEXT'])

In [12]:
from tqdm import tqdm_notebook

In [13]:
for i in tqdm_notebook(range(len(texts)), miniters=100) :
    texts[i] = cleaner(texts[i])

HBox(children=(IntProgress(value=0, max=6694), HTML(value='')))




In [14]:
for k in desc_dict :
    desc_dict[k] = cleaner(desc_dict[k])

In [15]:
del codes
data['TEXT'] = texts
data.to_csv('/media/sarthak/data/projects/Transparency/preprocess/MIMIC/cleaned_data' + positive_icd9 + '.csv', index=False)

In [16]:
import vectorizer as v
Vectorizer = v.Vectorizer

In [17]:
vec = Vectorizer(min_df=5)

In [18]:
vec.fit(list(data['TEXT']) + list(desc_dict.values()))

In [19]:
vec.label2idx = {k:i for i, k in enumerate(desc_dict.keys())}
vec.idx2label = {i:k for k, i in vec.label2idx.items()}
vec.label2desc = desc_dict
vec.code_desc = {k:vec.convert_to_sequence([x])[0] for k, x in desc_dict.items()}

In [20]:
from sklearn.model_selection import train_test_split
idxs = {}
idxs['train'], idxs['test'] = train_test_split(data.index, stratify=data['has_pos'], test_size=0.2, random_state=12939)

In [21]:
keys = ['train', 'test']
import numpy as np
vec.seq_text = {}
vec.label = {}
for k in keys :
    filtered = data[data.index.isin(idxs[k])]
    vec.seq_text[k] = vec.texts_to_sequences(list(data['TEXT']))
    vec.label[k] = np.where(list(data['has_pos']), 1, 0)

In [22]:
from gensim.models import Word2Vec

In [23]:
sentences = [[vec.idx2word[x] for x in y] for k in vec.seq_text for y in vec.seq_text[k]]
sentences += [[vec.idx2word[x] for x in y] for y in vec.code_desc.values()]

In [24]:
model = Word2Vec(sentences, size=200, window=10, min_count=1, workers=5)

In [25]:
model.train(sentences, total_examples=len(sentences), epochs=10)

(192620926, 286996240)

In [26]:
vec.extract_embeddings(model)

Found 18937 words in model out of 18938


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.21415395,  0.17256159,  0.22251911, ..., -1.00560653,
        -0.65924704, -1.60224962],
       [ 0.96354502,  5.50281048, -0.25558323, ...,  2.81642532,
         2.47439027,  2.4510026 ],
       ...,
       [ 4.68103075, -0.09528357,  0.07303315, ..., -0.73452204,
        -2.78040624, -1.82796764],
       [ 2.63897204,  0.53498065, -0.54808551, ..., -4.52820301,
        -8.33251095, -2.8496809 ],
       [ 0.97723985, -0.05751874,  0.95762306, ..., -2.89463377,
        -3.56042004, -1.29545152]])

In [27]:
import pickle
pickle.dump(vec, open('MIMIC/vec_icd9_' + positive_icd9 + '.p', 'wb'))