# Pull a sample of notes from MIMIC-III 

`NOTEEVENTS_DATA_TABLE.csv`

In [1]:
import re
import csv
import glob
import cPickle
import itertools
import numpy as np

## Map ShARe/CLEF corpus notes to MIMIC-III 

In [2]:
def get_clef_key(note):
    """
    Build the lookup key for a ShARe/CLEF note.

    `note` is an indexable sequence of header fields; the key is the
    fields at positions 1 and 3 joined by an underscore.
    """
    first_part, second_part = note[1], note[3]
    return "_".join((first_part, second_part))
    
def get_mimic_key(note):
    """
    Build the MIMIC-III key used to match a note against ShARe/CLEF keys.

    `note` is a mapping with 'SUBJECT_ID', 'CATEGORY' and 'DESCRIPTION'
    entries. Notes whose category is exactly "Radiology" use the fixed
    description "REPORT"; all others use their own description. The
    result is upper-cased.
    """
    if note['CATEGORY'] == "Radiology":
        description = "REPORT"
    else:
        description = note['DESCRIPTION']

    key = "{}_{}_{}".format(note['SUBJECT_ID'], note['CATEGORY'], description)
    return key.upper()

In [3]:
# Load every ShARe/CLEF training note into a {clef_key: note_text} dict.
root_dir = "../data/corpora/clef/2014ShAReCLEFeHealthTasks2/training/"
filelist = glob.glob("{}/*.txt".format(root_dir))
clef_corpus = []

for fpath in filelist:
    # Use a context manager so each file handle is closed promptly instead
    # of being left for the garbage collector.
    with open(fpath, "rU") as fp:
        doc = fp.read()

    # The first line of each file is a metadata header; the rest is the note.
    header = doc.splitlines()[0]

    # Drop the header by length. str.lstrip(header) treats its argument as a
    # *set of characters* to strip, so it only stopped at the right place
    # because the newline following the header is not in that set.
    doc = doc[len(header):]

    # Header fields are separated by "||||"; split each field on whitespace
    # and flatten everything into a single list of tokens.
    header = [token for field in header.split("||||") for token in field.split()]
    clef_corpus.append([get_clef_key(header), doc])

# Notes that share a key collapse to the last one read.
clef_corpus = dict(clef_corpus)
print(len(clef_corpus))

299


## Sample MIMIC-III documents
Here we use [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) to build an unlabeled dataset of 50,000 MIMIC-III critical care notes.

In [4]:
# Draw a uniform random sample of `n_samples` MIMIC-III notes in one pass
# over the NOTEEVENTS table, skipping notes already known to overlap with
# ShARe/CLEF.

# NOTE(review): unpickling can execute arbitrary code; only load a
# clef2mimic.pkl that this project produced.
with open("clef2mimic.pkl", "rb") as fp:
    ignore_docs = cPickle.load(fp)

n_samples = 50000
mimic_corpus = []
np.random.seed(1234)

# Number of *eligible* notes seen so far (header and ignored notes excluded).
n_seen = 0

# NOTE(review): absolute, user-specific path; adjust for your environment.
with open("/users/fries/NOTEEVENTS_DATA_TABLE.csv", "rU") as fp:
    csv_reader = csv.reader(fp)

    # First row holds the column names; an empty file yields an empty sample.
    header = next(csv_reader, [])

    for row in csv_reader:
        row = dict(zip(header, row))

        # skip documents that are identical to our ShARe/CLEF dev/test set
        key = get_mimic_key(row)
        if key in ignore_docs:
            continue

        n_seen += 1

        # reservoir sampling (Algorithm R): fill the reservoir first, then
        # keep the k-th eligible note with probability n_samples / k.
        if len(mimic_corpus) < n_samples:
            mimic_corpus.append(row)
        else:
            # randint's upper bound is exclusive, so j is uniform on
            # [0, n_seen - 1]. The previous code drew from the raw CSV row
            # index + 1, which also counted skipped rows and biased the
            # sample toward earlier notes.
            j = np.random.randint(0, n_seen)
            if j < n_samples:
                mimic_corpus[j] = row

print(len(mimic_corpus))

50000


## Remove Duplicate / Near identical MIMIC Notes
ShARe/CLEF notes are built from MIMIC-II, so there is some overlap with MIMIC-III data. Here we remove the overlapping MIMIC-III notes to make certain our dev/test sets contain unseen data.

In [5]:
# Assemble parallel lists of document ids and document texts:
# the sampled MIMIC-III notes come first, the ShARe/CLEF notes last.
doc_ids = [get_mimic_key(note) for note in mimic_corpus]
corpus = [note["TEXT"] for note in mimic_corpus]

for key, clef_text in clef_corpus.items():
    doc_ids.append(key)
    corpus.append(clef_text)

print(len(corpus))

50299


### Create Bag of Words Representation

In [6]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer

# Matches a whitespace-delimited token that contains the opening "[**" and/or
# the closing "**]" marker of a MIMIC anonymization placeholder. Written as a
# raw string so the backslashes reach the regex engine untouched (the non-raw
# form relied on invalid string escapes), and compiled once instead of being
# looked up for every token.
_MIMIC_PLACEHOLDER_RE = re.compile(r"\[\*\*(.+)\*\*\]|\[\*\*(.+)|(.+)\*\*\]")


def tokenize(s):
    """
    Split `s` on whitespace and drop MIMIC anonymization tokens.

    A token is dropped when it contains "[**" followed by at least one
    character, or at least one character followed by "**]" -- e.g.
    "[**LastName**]", or the first/last word of a multi-word placeholder
    such as "[**Known lastname 123**]". Interior words of a multi-word
    placeholder carry no marker and are therefore kept.

    Returns the remaining tokens as a list, in their original order.
    """
    return [t for t in s.split() if not _MIMIC_PLACEHOLDER_RE.search(t)]

# Bag-of-words counts over the combined corpus, using the custom `tokenize`
# so that MIMIC anonymization tokens do not become features.
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)

# L2-normalize each row (document) so that the dot product of two rows is
# their cosine similarity.
X_norm = normalize(X, norm='l2', axis=1)

In [7]:
# `doc_ids` and the rows of `X_norm` are ordered MIMIC-III sample first,
# ShARe/CLEF notes last. Derive the split point from the data instead of
# hard-coding the CLEF corpus size (299), so a different corpus or sample
# size cannot silently misalign the slices.
n_mimic = len(mimic_corpus)

# clef (query) and mimic (search) ids
query_ids = np.array(doc_ids[n_mimic:])
search_ids = np.array(doc_ids[:n_mimic])

# cosine similarity: rows are already L2-normalized, so the dot product of a
# CLEF row with a MIMIC row is their cosine similarity.
q = X_norm[n_mimic:]
d = X_norm[:n_mimic]
w = q.dot(d.T)

### Remove MIMIC-III notes

In [8]:
# Cosine similarity above which a MIMIC-III note is treated as a
# near-duplicate of a ShARe/CLEF note.
dup_threshold = 0.97

# (mimic_doc_id, clef_doc_id, score) for every CLEF note whose best MIMIC
# match exceeds the threshold.
rm = []
for i, query_doc_id in enumerate(query_ids):
    # Row i of the sparse similarity matrix as a flat dense vector.
    w_hat = w[i].toarray().ravel()

    # Only the single best match is needed, so take the argmax directly
    # rather than fully sorting the row and converting a 1-element array to
    # an int (that conversion is deprecated in recent NumPy). On exact ties
    # the chosen index may differ from the sort-based version.
    idxs = int(np.argmax(w_hat))
    score = w_hat[idxs]
    if score > dup_threshold:
        rm.append((search_ids[idxs], query_doc_id, score))

print("Found {} near-duplicates".format(len(rm)))

Found 0 near-duplicates


In [9]:
# Register every near-duplicate MIMIC-III note in the ignore list and echo
# the (mimic_id, clef_id, score) match that caused it.
for match in rm:
    mimic_doc_id = match[0]
    ignore_docs[mimic_doc_id] = 1
    print(match)

print(len(ignore_docs))

231


In [10]:
# Persist the updated ignore list. Open in binary mode to match the "rb"
# used when loading it, and use a context manager so the file is flushed
# and closed before anything else reads it.
with open("clef2mimic.pkl", "wb") as fp:
    cPickle.dump(ignore_docs, fp)

### Export MIMIC Training Sample

In [14]:
import os

# Write each sampled MIMIC-III note to its own text file.
outdir = "../data/corpora/mimic-iii/training/"
if not os.path.exists(outdir):
    # makedirs also creates missing parent directories; mkdir would fail
    # on a fresh checkout where "../data/corpora/mimic-iii" does not exist.
    os.makedirs(outdir)

# The MIMIC-III notes occupy the first len(mimic_corpus) slots of
# doc_ids / corpus; the ShARe/CLEF notes that follow are not exported.
n_mimic = len(mimic_corpus)

# Document ids are built from subject, category and description, so several
# notes can share one id. Track the file names already written and add a
# numeric suffix to repeats, so later notes no longer overwrite earlier ones.
used_names = set()
for doc_id, doc in zip(doc_ids[:n_mimic], corpus[:n_mimic]):
    base_name = doc_id.replace("/", "_")
    name, n_dup = base_name, 1
    while name in used_names:
        n_dup += 1
        name = "{}_{}".format(base_name, n_dup)
    used_names.add(name)

    fpath = os.path.join(outdir, "{}.txt".format(name))
    with open(fpath, "w") as fp:
        fp.write(doc)