# Sample Notes from MIMIC-III v1.4



In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.insert(0,'../../ehr-rwe/')

import glob
import collections
import numpy as np
import pandas as pd
import hashlib

MIMIC_DATASET_ROOT = '/Users/fries/'


## Load Annotated Notes

We load out set of annotated note IDs to guarantee our unlabeled set is disjoint

In [2]:
%%time
import collections

# load annotated files
filelist = glob.glob('/Users/fries/Desktop/npj-dev-surveillance-MIMIC-III/v2/jason/*')
filelist = [fp.split("/")[-1].split('.')[0].split('_')
            for fp in filelist if fp.split('.')[-1] not in ('ann','conf')]

annotations = dict.fromkeys([tuple([int(v) if v != 'nan' else np.nan for v in fp]) for fp in filelist])
annotations = {key[0] for key in annotations}
print(len(annotations))

annotation_subset = {}

# dump TSV
fname = f"{MIMIC_DATASET_ROOT}/NOTEEVENTS.csv.gz"
for chunk in pd.read_csv(fname, sep=',', compression='infer', chunksize=10000):
    for row in chunk.itertuples():
        digest = hashlib.md5(row.TEXT.encode("utf-8")).digest()
        #key = (row.ROW_ID, row.SUBJECT_ID, row.HADM_ID, digest)
           
        if row.ROW_ID in annotations:
            annotation_subset[row.ROW_ID] = row
        

print(len(annotation_subset))


298
298
CPU times: user 1min, sys: 3.59 s, total: 1min 4s
Wall time: 1min 4s


In [22]:
subset_final = glob.glob('/Users/fries/Desktop/npj-subset/v2/jason/*.ann')
subset_final = [int(fp.split('/')[-1].split('.')[0].split('_')[0]) for fp in subset_final]

with open('/Users/fries/Desktop/RELEASE-NPJ/gold.mimic.row_ids.tsv', 'w') as fp:
    for x in sorted(subset_final):
        fp.write(f'{x}\n')

        
        

with open('/Users/fries/Desktop/RELEASE-NPJ/holdout.mimic.row_ids.tsv', 'w') as fp:
    for x in sorted(annotation_subset):
        fp.write(f'{x}\n')

        

In [4]:
from prep_mimic import preprocess

tsv = []
for row_id in annotation_subset:
    row = annotation_subset[row_id]
    key = (row.ROW_ID, row.SUBJECT_ID, row.HADM_ID)
    digest = hashlib.md5(row.TEXT.encode("utf-8")).digest()
    chart_date = row.CHARTDATE
    chart_time = row.CHARTTIME
    category = row.CATEGORY
    desc = row.DESCRIPTION
    text = row.TEXT
    
    text = preprocess(text, preserve_offsets=True)
    text = text.replace('\n', '\\n').replace('\t', '\\t')
    tsv.append((row.ROW_ID, row.SUBJECT_ID, row.HADM_ID, 
                row.CHARTDATE, row.CHARTTIME, row.CATEGORY, 
               row.DESCRIPTION, text))
    
# dump to TSV   
outfpath = '/Users/fries/Desktop/RELEASE-NPJ/gold/annotated_corpus.tsv'
with open(outfpath, 'w') as fp:
    fp.write('\t'.join(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', "CHARTDATE", 
                        "CHARTTIME", "CATEGORY", "DESCRIPTION", 'TEXT']) + '\n')
    for row in tsv:
        row = list(map(str, row))
        fp.write('\t'.join(row) + '\n')
        

136
[2149, 2149, 1668, 2149, 2097, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149, 2149]
ERROR - normed date 1532 below 1900


## Query Notes

We generate a target annotation set by looking for documents where:
- Arguments occur within `max_dist` tokens of each other for both `PAIN-AT` and `IMPLANT-COMPLICATION` relations.
- Both relation types occur in the same document.

Individually, each relational candidate has approx this prevelance rate per 10,000 documents

- 2.41% `IMPLANT-COMPLICATION` 
- 31.1% `PAIN-AT` 


In [7]:
import re
import string
from rwe.utils import load_dict

def tokenize(s):
    toks = re.split(r'''([''' + string.punctuation + ''']|(\s|\n)+)''', s)
    return [t.strip().lower() for t in toks if t and t.strip()]

def match_query(queries, tokens, max_ngrams=4):
    for i in range(len(tokens)):
        for j in range(i+1, min(len(tokens) + 1, i + 1 + max_ngrams)):
            t = ' '.join(tokens[i:j])
            if t in queries:
                return (True, t, i)
    return (False, None, -1)
    
def match_relation(tokens, concept1, concept2, max_dist=25, verbose=False):
    h1,t1,i = match_query(concept1, tokens)
    if not h1:
        return False
    h2,t2,j = match_query(concept2, tokens)
    if not h2:
        return False
    if abs(i-j) < max_dist:
        if verbose:
            print(t1, t2, abs(i-j))
        return True
    return False
    

root_dir = "../data/dicts/"
stopwords = {'head'}

concepts = {
    'implants': load_dict(f'{root_dir}/implants/implants.txt', stopwords=stopwords),
    'complications':load_dict(f'{root_dir}/implants/implant_complications.txt'),
    'anatomy':load_dict(f'{root_dir}/anatomy/anat.bz2'),
    'pain':load_dict(f'{root_dir}/pain/pain.txt')
}


In [36]:
%%time

hits = {}

n_samples = None #10000
max_dist = 25
target = ('implants', 'complications')
target = ('anatomy', 'pain')

fname = "/Users/fries/NOTEEVENTS.csv.gz"
for chunk in pd.read_csv(fname, sep=',', compression='infer', chunksize=10000):
    for row in chunk.itertuples():
        digest = hashlib.md5(row.TEXT.encode("utf-8")).digest()
        key = (row.ROW_ID, row.SUBJECT_ID, row.HADM_ID, digest)
        
        if key[0] in annotation_subset:
            print(f'skipping {key[0]}')
            continue
        
        if key in hits:
            continue
            
        toks = tokenize(row.TEXT)
        #v = match_relation(toks, concepts['implants'], concepts['complications'], max_dist)
        #if not v:
        #    continue
        v = match_relation(toks, concepts['anatomy'], concepts['pain'], max_dist)
        if v:
            hits[key] = row.TEXT

    if n_samples != None and len(hits) > n_samples:
        break

print('hits', len(hits))

skipping 718
skipping 765
skipping 463
skipping 501
skipping 510
skipping 519
skipping 271
skipping 361
skipping 838
skipping 600
skipping 1438
skipping 990
skipping 1108
skipping 1254
skipping 1711
skipping 1519
skipping 1574
skipping 1584
skipping 1679
skipping 2322
skipping 1946
skipping 1960
skipping 1725
skipping 1757
skipping 1843
skipping 1880
skipping 1885
skipping 2495
skipping 2364
skipping 2604
skipping 2899
skipping 2902
skipping 3641
skipping 3643
skipping 3674
skipping 4199
skipping 4074
skipping 4082
skipping 3533
skipping 4262
skipping 4410
skipping 4656
skipping 4898
skipping 4935
skipping 5094
skipping 5167
skipping 5040
skipping 5370
skipping 5425
skipping 5432
skipping 5433
skipping 5557
skipping 8689
skipping 7123
skipping 12268
skipping 6599
skipping 6645
skipping 7317
skipping 13167
skipping 10433
skipping 5928
skipping 5766
skipping 5782
skipping 5789
skipping 5811
skipping 6693
skipping 6699
skipping 10338
skipping 8038
skipping 9362
skipping 8079
skipping 6945

In [37]:
len(hits)

133384

In [39]:
with open('/Users/fries/Desktop/RELEASE-NPJ/unlabeled.pain.all.mimic.row_ids.tsv','w') as fp:
    for key in hits:
        fp.write(f"{key[0]}\n")


## Export Documents

In [None]:
outdir = '/Users/fries/Desktop/mimic-sample/'

subjects = {}
for key in hits:
    row_id, subject_id, hadm_id, _ = key
    if subject_id in subjects:
        continue
    subjects[subject_id] = 1
    fname = f'{outdir}/{row_id}_{subject_id}_{hadm_id}.txt'
    with open(fname,'w') as fp:
        fp.write(hits[key])
print(len(subjects))

## Export DocTimes

In [None]:
from datetime import datetime

s = '2116-02-07 14:08:00'


In [None]:
%%time
import numpy as np
import pandas as pd
import collections 

doc_times = {}

fname = "/Users/fries/NOTEEVENTS.csv.gz"
for chunk in pd.read_csv(fname, sep=',', compression='infer', chunksize=10000):
    for i,row in enumerate(chunk.itertuples()):
        chart_ts = str(row.CHARTTIME)
        if chart_ts != 'nan':
            ts = datetime.strptime(chart_ts, '%Y-%m-%d %H:%M:%S')
        else:
            chart_ts = str(row.CHARTDATE)
            ts = datetime.strptime(chart_ts, '%Y-%m-%d')
        key = (row.ROW_ID, row.SUBJECT_ID, row.HADM_ID)
        doc_times['_'.join(map(str,key))] = ts

print(len(doc_times))





In [None]:
import pickle
pickle.dump(doc_times, open('/users/fries/desktop/mimic-doctimes.pkl', 'wb'))

## Preprocess Notes

## Tag Candidates

In [None]:
from rwe import dataloader

inputdir = '/Users/fries/Desktop/sample/'
corpus = dataloader(glob.glob(f'{inputdir}/*.json'))
print(f'Loaded {len(corpus)} documents')


In [None]:
from rwe.utils import load_dict
from rwe.labelers.taggers import (
    ResetTags, RelationTagger, 
    DictionaryTagger, NegExTagger, HypotheticalTagger, 
    SectionHeaderTagger, ParentSectionTagger
)

dict_pain = load_dict('../data/dicts/pain/pain.txt')
dict_anat = load_dict('../data/dicts/anatomy/anat.bz2')
dict_impl = load_dict('../data/dicts/implants/implants.txt')
dict_comp = load_dict('../data/dicts/implants/implant_complications.txt')

pipeline = {
    # clear any previous runs
    "reset"        : ResetTags(),
    
    # concepts
    "pain"         : DictionaryTagger({'pain': dict_pain}),
    "anatomy"      : DictionaryTagger({'anatomy': dict_anat}),
    "implant"      : DictionaryTagger({'implant': dict_impl}),
    "complication" : DictionaryTagger({'complication': dict_comp}),
    "sections"     : SectionHeaderTagger(),
    
    # attributes / primitives
    "negation"     : NegExTagger(targets=['pain'], data_root="../data/dicts/negex/"),
    "hypothetical" : HypotheticalTagger(targets=['pain']),
    "headers"      : ParentSectionTagger(targets=['pain']),
    
    # relations
    "pain-at"         : RelationTagger('pain-at', ('pain', 'anatomy')),
    "complication-at" : RelationTagger('comp-at', ('complication', 'implant')),
}


In [None]:
%%time
from rwe.labelers import TaggerPipelineServer

tagger = TaggerPipelineServer(num_workers=4)
documents = tagger.apply(pipeline, [corpus])


In [None]:
from rwe.utils import build_candidate_set

Xs_pain_at = build_candidate_set(documents[0], "pain-at")
Xs_comp_at = build_candidate_set(documents[0], "comp-at")

In [None]:
def collapse_relation_args(relations):
    return set([s for x in relations for s in x])
    
pain_at_spans = collapse_relation_args(Xs_pain_at)
comp_at_spans = collapse_relation_args(Xs_comp_at)

doc_span_index = collections.defaultdict(set)
for s in comp_at_spans:
    doc_span_index[s.sentence.document.name].add(s)
for s in pain_at_spans:
    doc_span_index[s.sentence.document.name].add(s)
    
n = 0
for doc_name in doc_span_index:
    print(doc_name, len(doc_span_index[doc_name]))
    n += len(doc_span_index[doc_name])
print(n)
    

In [None]:
outdir = '/Users/fries/Desktop/mimic-sample-brat/'
etype = 'Concept'

for doc_name in doc_span_index:
    outfname = f'{outdir}/{doc_name}.ann'
    with open(outfname, 'w') as fp:
        items = set()
        for i,s in enumerate(doc_span_index[doc_name]):
            # T8	Concept 468 479;480 489	right lower extremity
            multi_spans = []
            start = s.abs_char_start
            if '\n' in s.text:
                toks = s.text.split("\n")
                for t in toks:
                    multi_spans.append((start, start + len(t)))
                    start += len(t) + 1
                
                span_str = [f'{span[0]} {span[1]}' for span in multi_spans]
                anno = (etype, " ", ";".join(span_str), "\t", s.text.replace("\n", " "))
            else:
                anno = (etype, " ", f"{s.abs_char_start} {s.abs_char_end+1}", "\t", s.text.replace("\n", " "))
                
            items.add(anno)
            
        for i,s in enumerate(sorted(items,key=lambda x:x[1], reverse=0)):
            anno = f'T{i+1}\t{"".join(s)}'
            print(anno)
            fp.write(anno + '\n')
