This notebook requires the MIMIC v1.4 `NOTEEVENTS.csv.gz` file

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.insert(0,'../../ehr-rwe/')

import glob
import collections
import numpy as np
import pandas as pd
import hashlib
import datetime

import os  # used only for the environment lookups below

# Root folders used by the cells below. The defaults are the original author's
# local paths; export an environment variable of the same name to run the
# notebook on another machine without editing it.
#   MIMIC_DATASET_ROOT : folder containing NOTEEVENTS.csv.gz
#   ANNO_DATASET_ROOT  : folder containing the *.row_ids.tsv files
MIMIC_DATASET_ROOT = os.environ.get('MIMIC_DATASET_ROOT', '/Users/fries/')
ANNO_DATASET_ROOT = os.environ.get('ANNO_DATASET_ROOT', '/Users/fries/Desktop/RELEASE-NPJ/')


## Export TSV Patient Notes

In PHI settings, we have access to real timestamps that our labeling functions can use to infer the relative time offsets of clinical events. However, MIMIC notes blind dates with special markup tokens of the form `[**2988-12-24**]`. As a preprocessing step, we replace these special tokens with realistic-looking dates so that we can use our date-aware labeling functions.


In [11]:
%%time
def load_clinical_notes(row_ids, mimic_fpath):
    """Load the NOTEEVENTS rows whose ``ROW_ID`` is in ``row_ids``.

    The CSV is streamed in chunks of 10,000 rows, so the full file is never
    held in memory at once.

    Parameters
    ----------
    row_ids : set of int
        ``ROW_ID`` values to keep.
    mimic_fpath : str
        Path to the NOTEEVENTS CSV; compression is inferred by pandas.

    Returns
    -------
    dict
        Maps each matching ``ROW_ID`` to its row, a namedtuple as yielded by
        ``DataFrame.itertuples()`` (the CSV columns plus ``Index``). If a
        ``ROW_ID`` appears more than once, the last occurrence wins.
    """
    docs = {}
    for chunk in pd.read_csv(mimic_fpath, sep=',', compression='infer', chunksize=10000):
        # Select the wanted rows with a vectorised mask so that only matches
        # are turned into Python tuples, instead of testing every row of the
        # file one by one in the interpreter.
        wanted = chunk[chunk['ROW_ID'].isin(row_ids)]
        for row in wanted.itertuples():
            docs[row.ROW_ID] = row

    return docs

def load_row_ids(fpath):
    """Read a text file holding one integer row id per line.

    Blank lines (for example a trailing empty line) are ignored.

    Parameters
    ----------
    fpath : str
        Path of the file to read.

    Returns
    -------
    set of int
        The distinct ids found in the file.

    Raises
    ------
    ValueError
        If a non-blank line cannot be parsed as an integer.
    """
    # The context manager closes the handle, which the bare open() never did.
    with open(fpath, 'r') as fp:
        return {int(line) for line in fp if line.strip()}

# Split name -> file (under ANNO_DATASET_ROOT) listing that split's ROW_IDs.
dataset = {
    'gold': 'gold.mimic.row_ids.tsv',
    'unlabeled': 'unlabeled.pain_complications.mimic.row_ids.tsv'
}

# Stage 1: replace every filename by the set of ROW_IDs it contains.
dataset = {
    name: load_row_ids(f'{ANNO_DATASET_ROOT}/{fname}')
    for name, fname in dataset.items()
}

# Stage 2: replace every ROW_ID set by the matching NOTEEVENTS rows and
# report how many notes were found for the split.
mimic_fpath = f"{MIMIC_DATASET_ROOT}/NOTEEVENTS.csv.gz"
for name in dataset:
    dataset[name] = load_clinical_notes(dataset[name], mimic_fpath)
    print(f'{name} {len(dataset[name])}')

gold 55
unlabeled 1322
CPU times: user 1min 42s, sys: 6.58 s, total: 1min 49s
Wall time: 1min 49s


In [12]:
%%time
from datetime import datetime, timedelta
from prep_mimic import preprocess

def process_clinical_notes(docs, seed=None):
    """Convert raw NOTEEVENTS rows into export-ready dicts, in place.

    Every value of ``docs`` that is still a raw row (a namedtuple exposing
    ``_fields``) is replaced by a plain dict of its fields, minus ``Index``,
    in which:

    * ``TEXT`` is the text returned by ``preprocess`` with newline, tab and
      carriage-return characters escaped, so a note fits on one TSV line;
    * ``CHARTDATE`` / ``CHARTTIME`` are moved back by ``tdelta * 365`` days
      and stored as strings (NaN when the original value was missing);
    * ``ROW_ID`` and ``SUBJECT_ID`` are ints; ``HADM_ID`` is an int, or the
      string ``'NaN'`` when missing;
    * ``DOC_NAME`` is ``"<ROW_ID>_<SUBJECT_ID>_<HADM_ID>"``.

    Values that are already dicts are left untouched, so running the function
    (or the notebook cell) a second time is harmless.

    Parameters
    ----------
    docs : dict
        Maps ROW_ID to a raw row; modified in place.
    seed : int, optional
        Seeds the random choice of a target year used when ``preprocess``
        reports a zero offset. ``None`` (the default) draws from NumPy's
        global random state, as before.

    Raises
    ------
    ValueError
        If a row has neither a ``CHARTDATE`` nor a ``CHARTTIME`` string.
    """
    # seed=None keeps the historical behaviour (NumPy's global random state).
    rng = np.random if seed is None else np.random.RandomState(seed)

    for row_id in docs:
        row = docs[row_id]
        if isinstance(row, dict):
            # Already converted by an earlier call; converting again would
            # fail on row._fields.
            continue
        row = {name: getattr(row, name) for name in row._fields if name != 'Index'}

        # Convert timestamps. Anything that is not a string (e.g. the NaN
        # pandas yields for an empty cell) is treated as missing.
        raw_date, raw_time = row['CHARTDATE'], row['CHARTTIME']
        chartdate = datetime.strptime(raw_date, '%Y-%m-%d') if isinstance(raw_date, str) else None
        charttime = datetime.strptime(raw_time, '%Y-%m-%d %H:%M:%S') if isinstance(raw_time, str) else None

        if chartdate is None and charttime is None:
            raise ValueError(f'ROW_ID {row_id}: neither CHARTDATE nor CHARTTIME is set')

        # Year of the structured chart timestamp; CHARTTIME wins when present.
        doc_ts = charttime.year if charttime is not None else chartdate.year

        # Convert the note text.
        # NOTE(review): tdelta is used below as a number of years; presumably
        # it is the shift preprocess applied to dates inside the note --
        # confirm against prep_mimic.
        text, tdelta = preprocess(row["TEXT"], doc_ts=doc_ts, preserve_offsets=True)
        # Escape whitespace so that each note occupies a single TSV line.
        text = text.replace('\n', '\\n').replace('\t', '\\t').replace('\r', '\\r')

        # A zero offset means no full datetimes were found in the note, so
        # pick a target year in 2008-2019 at random and derive the offset.
        if tdelta == 0:
            sample_range = range(2008, 2020)
            tdelta = int(doc_ts - rng.choice(sample_range, 1)[0])

        # Years are approximated as 365 days (leap days are ignored).
        if chartdate is not None:
            chartdate -= timedelta(days=tdelta * 365)
        if charttime is not None:
            charttime -= timedelta(days=tdelta * 365)

        # HADM_ID can be missing (NaN); keep a readable marker in that case.
        if not isinstance(row['HADM_ID'], str) and np.isnan(row['HADM_ID']):
            row['HADM_ID'] = 'NaN'
        else:
            row['HADM_ID'] = int(row['HADM_ID'])

        row['SUBJECT_ID'] = int(row['SUBJECT_ID'])
        row['ROW_ID'] = int(row['ROW_ID'])

        row['DOC_NAME'] = f"{row['ROW_ID']}_{row['SUBJECT_ID']}_{row['HADM_ID']}"
        row['TEXT'] = text
        row['CHARTDATE'] = str(chartdate.date()) if chartdate is not None else np.nan
        row['CHARTTIME'] = str(charttime) if charttime is not None else np.nan
        docs[row_id] = row


# Convert both splits in place, gold first.
for split in ('gold', 'unlabeled'):
    process_clinical_notes(dataset[split])


170625
132244
197945
165691
124316
109058
135095
122726
109617
193396
185235
191210
150426
131508
194603
143003
113700
195961
132894
129957
154849
156659
134550
108226
127941
197951
133784
178133
198888
181689
177220
143794
177363
113484
133306
106110
190280
174115
193364
114219
184705
175298
145528
193105
195185
187890
165800
103943
185828
119104
183338
194796
124007
192671
111136
163484
168845
150835
171119
153632
182099
159375
147464
183202
168374
195078
164898
102769
119897
187690
178442
120255
141699
194224
122769
134305
147928
132246
100119
189919
125597
147491
159138
168252
107256
116880
114013
105444
156372
100794
158905
160318
175182
193820
190685
NaN
147501
109874
152610
140041
174850
184517
108622
118481
127190
149585
185578
154165
127208
121907
164330
140590
197946
186185
177737
142052
130806
181231
188864
103309
153726
112928
131973
109679
109679
116532
116532
116532
116532
109679
109679
109679
109679
109679
109679
NaN
109679
116532
116532
109679
109679
109679
109679
10967

In [13]:
def dump_tsvs(dataset, fpath):
    """Write every split of ``dataset`` to ``<fpath>/<split>.tsv``.

    Each file starts with a header line of the alphabetically sorted column
    names, taken from the split's first row, followed by one tab-separated
    line per row with every value passed through ``str()``. A split without
    rows produces an empty file. The split name is printed as it is written.

    Parameters
    ----------
    dataset : dict
        Maps split name -> {row id -> dict of column values}.
    fpath : str
        Output directory; it is created if it does not exist.

    Raises
    ------
    ValueError
        If a row's keys differ from the header of its file, which would
        otherwise silently misalign the columns.
    """
    import os  # local import: the notebook's import cell does not provide os

    if fpath:
        os.makedirs(fpath, exist_ok=True)

    for name in dataset:
        print(name)
        header = None
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(f'{fpath}/{name}.tsv', 'w', encoding='utf-8') as fp:
            for row_id, row in dataset[name].items():
                if header is None:
                    # The header is fixed by the first row of the split.
                    header = sorted(row.keys())
                    header_keys = set(header)
                    fp.write('\t'.join(header))
                    fp.write('\n')
                elif row.keys() != header_keys:
                    raise ValueError(
                        f'{name}: row {row_id} has columns {sorted(row.keys())}, expected {header}'
                    )

                line = '\t'.join(str(row[col]) for col in header)
                fp.write(f'{line}\n')
            
            
# Folder that receives one <split>.tsv file per entry of `dataset`.
outdir = '/Users/fries/Desktop/foobar/'
dump_tsvs(dataset=dataset, fpath=outdir)

gold
unlabeled
