In [1]:
import pandas as pd

In [2]:
# Load the readmission notes and discard every row that has a missing value
# in any column.
codes = (
    pd.read_csv('MIMIC/readmissions_readmit.csv')
    .dropna()
)

In [3]:
import spacy, re
# Load the spaCy 'en' pipeline with the parser, tagger and NER components
# disabled; `cleaner` below only reads the token text from it.
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def cleaner(text, spacy=True):
    """Normalise a note into a lower-cased, single-space-separated token string.

    Steps, in order:
      1. trim the text and collapse whitespace runs to one space;
      2. tokenise (with the module-level spaCy pipeline ``nlp`` when ``spacy``
         is true, otherwise by whitespace) and lower-case every token;
      3. replace ``[** ... **]`` spans with the placeholder ``DEIDENTIFY``;
      4. squash runs of the same non-alphanumeric character into one;
      5. pad every non-word character with spaces and re-collapse whitespace;
      6. replace any token containing a digit with ``qqq``.

    Args:
        text: raw note text.
        spacy: use the spaCy tokenizer (default) instead of ``str.split``.

    Returns:
        The cleaned text as a single string.
    """
    squeezed = re.sub(r'\s+', ' ', text.strip())

    # Pick the token source; lower-casing happens per token in both cases.
    pieces = (tok.text for tok in nlp(squeezed)) if spacy else squeezed.split()
    cleaned = " ".join(piece.lower() for piece in pieces)

    # Applied strictly in this order: the placeholder must be inserted before
    # punctuation is squashed and padded, otherwise the brackets and
    # asterisks would already have been split apart.
    substitutions = (
        (r'\[\s*\*\s*\*(.*?)\*\s*\*\s*\]', ' DEIDENTIFY '),
        (r'([^a-zA-Z0-9])(\s*\1\s*)+', r'\1 '),
        (r'(\W)', r' \1 '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())

    def mask_number(word):
        # Any token with at least one digit is replaced by the 'qqq' marker.
        return 'qqq' if any(ch.isdigit() for ch in word) else word

    return " ".join(map(mask_number, cleaned.split(' ')))

In [4]:
# Pull the note text column out as a plain Python list so it can be cleaned
# item by item.
texts = codes['text'].tolist()

In [5]:
from tqdm import tqdm_notebook

In [8]:
# Clean every note in place, showing a notebook progress bar.
# NOTE(review): this overwrites `texts`, so re-running the cell cleans
# already-cleaned text again (e.g. the upper-case DEIDENTIFY placeholder that
# `cleaner` inserts would be lower-cased on a second pass).
for i in tqdm_notebook(range(len(texts)), miniters=100) :
    texts[i] = cleaner(texts[i])

HBox(children=(IntProgress(value=0, max=8411), HTML(value='')))




In [9]:
# Replace the raw notes with their cleaned versions and save the frame
# (without the index column) so the cleaned data can be reused.
codes['text'] = texts
codes.to_csv('MIMIC/cleaned_readmit_only_data.csv', index=False)

In [21]:
import vectorizer as v
# Short alias for the Vectorizer class from the imported `vectorizer` module.
Vectorizer = v.Vectorizer

In [22]:
# Create the vectorizer instance used for the rest of the notebook.
# NOTE(review): min_df=5 is presumably a minimum document-frequency cutoff
# for vocabulary words — confirm against vectorizer.py.
vec = Vectorizer(min_df=5)

In [23]:
# Fit the vectorizer on every cleaned note.
# NOTE(review): this runs before the train/test split below, so whatever
# `fit` learns (presumably the vocabulary) also sees the held-out notes —
# confirm this is intended.
vec.fit(list(codes['text']))

In [24]:
from sklearn.model_selection import train_test_split
# Stratified 90/10 split of the admission ids on the 30-day readmission
# label; the fixed random_state keeps the split reproducible.
keys = dict(zip(
    ('train', 'test'),
    train_test_split(
        list(codes['hadm_id']),
        stratify=list(codes['readmission_30']),
        test_size=0.1,
        random_state=1034,
    ),
))
# keys['train'], keys['dev'] = train_test_split(keys['train'], test_size=0.1, random_state=1035)

In [25]:
# For each split, collect the rows whose admission id belongs to it, then
# store their token-id sequences and readmission labels on the vectorizer.
# NOTE(review): rows are matched with `isin`, which assumes each hadm_id
# appears on exactly one row; a duplicated id could place the same rows in
# both splits — TODO confirm.
vec.seqs, vec.labels = {}, {}
for k in keys:
    code_filtered = codes.loc[codes['hadm_id'].isin(keys[k])]
    vec.seqs[k] = vec.texts_to_sequences(list(code_filtered['text']))
    vec.labels[k] = list(code_filtered['readmission_30'])

In [26]:
from gensim.models import Word2Vec

In [16]:
# Map every stored id sequence (across all splits) back to its words to get
# the tokenised corpus for Word2Vec.
# NOTE(review): the saved execution counts show this cell (16) was run before
# the cells that create and fill `vec` (22-25); re-run the notebook top to
# bottom to confirm the results reproduce.
sentences = [
    [vec.idx2word[token_id] for token_id in seq]
    for split_seqs in vec.seqs.values()
    for seq in split_seqs
]

In [17]:
# 200-dimensional Word2Vec embeddings with a 10-token context window;
# min_count=1 keeps every word that occurs in `sentences`.
# NOTE(review): gensim's Word2Vec appears to train during construction when
# sentences are supplied, so the explicit model.train call in the next cell
# would add epochs on top of that — confirm this is intended.
model = Word2Vec(
    sentences=sentences,
    size=200,
    window=10,
    min_count=1,
    workers=5,
)

In [18]:
# Run an explicit 10-epoch training pass over the same sentences; the value
# returned by train() is displayed as the cell output.
# NOTE(review): `model` was already constructed from `sentences` in the
# previous cell; gensim appears to train in the constructor when sentences are
# supplied, so this looks like additional epochs — confirm this is intended.
model.train(sentences, total_examples=len(sentences), epochs=10)

(129461018, 201966080)

In [27]:
# Hand the trained Word2Vec model to the vectorizer to extract embeddings.
# NOTE(review): judging by the cell output this reports how many vocabulary
# words were found in the model and returns the embedding array — confirm
# against vectorizer.py. The full array repr is echoed as the cell output.
vec.extract_embeddings(model)

Found 18685 words in model out of 18686


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06836064,  0.20034975,  0.56518489, ...,  0.31307557,
        -1.59697318, -1.1097424 ],
       [-6.25817823, -4.71902704, 10.04657364, ..., -7.18311071,
         0.85669881, -1.98961699],
       ...,
       [-0.24864443, -1.03268707,  0.57694995, ...,  1.15038824,
        -0.97294104,  0.32838869],
       [-0.28442943,  0.44261733, -2.66807628, ...,  0.7622484 ,
        -3.90297222,  1.01426923],
       [-1.2994324 ,  0.57265449,  0.35146809, ..., -0.7985571 ,
         1.21987605,  1.99669313]])

In [28]:
import pickle
# Persist the vectorizer object to disk. The context manager closes the file
# deterministically, so the pickle is fully flushed before the notebook moves
# on (the bare open(...) used previously was never closed explicitly).
with open('MIMIC/vec_admit_only.p', 'wb') as vec_file:
    pickle.dump(vec, vec_file)