## Writeup
100% LVEF extraction accuracy against human labeled data on a random sample of 100 echo notes using several rules:
1. preprocess and tokenize text
1. search for matches of ['LVEF', 'EF', 'ejection fraction'] in the cleaned tokens
1. for each match, take the two tokens preceding the match and the three words following the match
1. return the first token containing a numeric character

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer

In [2]:
# Regex tokenizer built from the pattern \w+; get_aoi uses it to split each
# value description into tokens before matching against the note tokens.
tokenizer = RegexpTokenizer(r'\w+')
# Descriptions that are searched for in a note to locate the value to extract.
value_descriptions = ['LVEF', 'EF', 'ejection fraction']

In [3]:
# Load the annotated sample of notes.
df = pd.read_csv('mimiciii_noteevents_random_100_annotated.csv')
# Turn the literal string 'None' in the label column into a real missing value.
# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute-accessed column: that chained in-place form is not guaranteed to
# update `df` (and never does under pandas Copy-on-Write). np.nan is used
# because the np.NaN alias no longer exists in NumPy 2.0.
df['ef_value'] = df['ef_value'].replace('None', np.nan)

In [4]:
def preprocess(text):
    """Normalise raw note text so that a whitespace split yields clean tokens.

    The steps are applied in this order:

    1. the words 'percent' and 'equals' become '%' and '='
    2. a space is inserted before every '<', '>', '=' and ',' so these
       characters do not stay attached to the text in front of them
    3. a '%' directly preceded by one space or one newline is joined onto
       the preceding text
    4. round, square and curly brackets are deleted

    Returns the transformed string.
    """
    # 1. spelled-out words -> symbols
    text = text.replace('percent', '%').replace('equals', '=')

    # 2. detach comparison signs and commas from whatever precedes them
    detach = {ord(ch): ' ' + ch for ch in '<>=,'}
    text = text.translate(detach)

    # 3. glue '%' back onto the preceding text (space case first, then newline)
    text = text.replace(' %', '%').replace('\n%', '%')

    # 4. drop every kind of bracket
    return text.translate(dict.fromkeys(map(ord, '()[]{}')))

def endswithin(s, strs):
    """Return True when `s` ends with at least one of the suffixes in `strs`."""
    return any(s.endswith(suffix) for suffix in strs)

def get_aoi(tokens, value_descriptions):
    """Get areas of interest (lists of tokens) for descriptions of tests `value_descriptions`.

    Every description is tokenized with the module-level `tokenizer`. Wherever
    its tokens occur consecutively in `tokens`, the window running from one
    token before the match through two tokens after the match is collected.

    Parameters
    ----------
    tokens : list of str
        Tokens of the note being searched.
    value_descriptions : iterable of str
        Descriptions to look for, e.g. 'LVEF' or 'ejection fraction'.

    Returns
    -------
    list of list of str
        One window per match, ordered by match position and, for the same
        position, by the order of `value_descriptions`.
    """
    # Tokenize each description once rather than once per token position.
    vd_token_lists = [tokenizer.tokenize(vd) for vd in value_descriptions]

    aois = []
    for i in range(len(tokens)):
        for vd_tokens in vd_token_lists:
            n = len(vd_tokens)
            # Comparing a slice cannot index past the end of `tokens`, so a
            # multi-token description that starts matching on the last token
            # is simply "no match" instead of an IndexError.
            if list(tokens[i:i + n]) == vd_tokens:
                # Clamp the start at 0: for a match at the first token a start
                # of -1 would wrap to the end of the list and return an empty
                # or wrong window.
                aois.append(tokens[max(i - 1, 0): i + n + 2])
    return aois

def contains_num(s):
    """Report whether at least one character of `s` is numeric (`str.isnumeric`)."""
    return any(ch.isnumeric() for ch in s)

def aois_to_value(aois):
    """Extract the numeric value from aoi list (list of tokens) `aois`.

    The areas of interest are scanned in order, and the tokens of each one in
    order; the first token that contains a numeric character is returned
    unchanged (as a string). If no token qualifies, NaN is returned.
    """
    for aoi in aois:
        for token in aoi:
            # return the first token with a numeric character in it
            if contains_num(token):
                return token
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    # and raises AttributeError there; np.nan works on every NumPy version.
    return np.nan

def preprocess_token(text):
    """Strip every trailing '.', ':' and '=' character from a single token."""
    # rstrip removes characters from the right for as long as they belong to
    # the given set, which is what the one-character-at-a-time loop did.
    return text.rstrip('.:=')

def text_to_value(text):
    """Run the full pipeline on `text` and return the extracted value.

    The text is cleaned with `preprocess`, split on whitespace, each piece is
    trimmed with `preprocess_token`, and the areas of interest around the
    module-level `value_descriptions` are handed to `aois_to_value`.
    """
    cleaned = preprocess(text)
    tokens = list(map(preprocess_token, cleaned.split()))
    return aois_to_value(get_aoi(tokens, value_descriptions))

In [5]:
# Example: a value written as a range inside parentheses.
text_to_value('low normal (LVEF 50-55%). The')

'50-55%'

In [6]:
# Example: a colon after the description and a space between number and '%'.
text_to_value('LVEF: 58.7 %  ')

'58.7%'

In [7]:
# summarize accuracy of the extracted values against the human labels
df['extracted_lvef'] = df.text.map(text_to_value)
# Compare on '' instead of NaN so that "no label" and "nothing extracted"
# count as agreement (NaN never compares equal to NaN).
df['extraction_matches_label'] = df.ef_value.fillna('') == df.extracted_lvef.fillna('')
# The mean of the boolean column is the match rate. Unlike
# value_counts(normalize=True)[True] it does not raise KeyError when no row
# matches; it reports 0% instead.
print('{:.0f}% of extracted values match human labels'.format(
    df.extraction_matches_label.mean() * 100))

100% of extracted values match human labels


In [8]:
# collect the rows where the extracted value disagrees with the human label
# and print each of them together with the full note text
mismatches = df.loc[
    ~df.extraction_matches_label,
    ['ef_value', 'extracted_lvef', 'text', 'row_id'],
]
print('Mimatched labels (extracted vs human label)')
for idx, mismatch in mismatches.iterrows():
    print(
        '[{}] extracted value {}, human label {}'
        '\n-----------------------------------------\n\n{}'
        '\n\n----------------------------------------\n'.format(
            mismatch['row_id'],
            mismatch['extracted_lvef'],
            mismatch['ef_value'],
            mismatch['text'],
        )
    )

Mimatched labels (extracted vs human label)


### RWJ sample

In [9]:
# read the report export into a list of lines (line endings kept)
with open('nbi_xcel_res_deident.txt') as f:
    rwj = list(f)

In [10]:
# split every line after the first three on '|'
cells = [row.split('|') for row in rwj[3:]]
# keep the sixth field of each row that has one
txt_lines = [fields[5] for fields in cells if len(fields) > 5]
# cut the collected lines into two groups at position 79
# NOTE(review): presumably the first report is 79 lines long - confirm against the file
txt_lines = (txt_lines[:79], txt_lines[79:])
# rebuild one text per group
txts = list(map('\n'.join, txt_lines))

In [11]:
# show the extracted value followed by the text it was extracted from
for t in txts:
    print(
        '------------------------------------\nExtracted EF value: {}'.format(text_to_value(t)),
        t,
        sep='\n',
    )

------------------------------------
Extracted EF value: 56.5%

                                             Newark Beth Israel
                                               Medical Center
                                            Cardiac Non-Invasive
                                                 Laboratory
                                               201 Lyons Ave
                                             Newark, New Jersey
                                                   07112
                                          Phone: (973) 926-7476
                    Transthoracic Echocardiogram Report
____________________________________________________________________________

Name: XERO, XENA C                Study Date: 03/20/2018 09:07 AM
MRN: 763234                         Account #: 654321888
DOB: 06/27/1950     Age: 67 yrs     Patient Status: Outpatient
Gender: Female                      Referring Physician: Duck, Don, M.D.
Height: 66 in       Weight: 161 lb  BSA: 1.8 m