In [7]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import pandas as pd
import numpy as np
import scipy as sp
import datetime
import matplotlib.pyplot as plt
import math
import re
import csv
import pickle

# the following are NLTK and WordNet packages for the baseline method:
#from __future__ import unicode_literals
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet as wn
from nltk.compat import python_2_unicode_compatible
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
import nltk

%matplotlib inline

In [2]:
# the following is based on the source code provided for WordNet Lemmatizer (see report)
class WordNetLemmatizer(object):
    """Minimal WordNet-backed lemmatizer used as the baseline method.

    NOTE: this class deliberately reuses the name ``WordNetLemmatizer`` and
    therefore shadows the class of the same name imported from ``nltk.stem``
    in the import cell.
    """

    def __init__(self):
        pass

    def lemmatize(self, word, pos=NOUN):
        """Lemmatize ``word`` for the given WordNet part of speech.

        Calls ``wn._morphy(word, pos)`` and returns the shortest candidate it
        yields; when it yields nothing, ``word`` is returned unchanged.
        """
        lemmas = wn._morphy(word, pos)
        return min(lemmas, key=len) if lemmas else word

    def __repr__(self):
        # Was misspelled ``__rept__`` in the original, so Python never used it
        # and instances printed with the default object repr.
        return '<WordNetLemmatizer>'

    # Keep the old (misspelled) attribute available for any caller that used it.
    __rept__ = __repr__

    # unload wordnet
    @staticmethod
    def teardown_module(module=None):
        """Unload the WordNet corpus via ``wn._unload()``.

        Declared static because the function takes no ``self``; previously the
        instance was silently bound to ``module`` when called on an object.
        Both ``WordNetLemmatizer.teardown_module()`` and
        ``instance.teardown_module()`` keep working.
        """
        wn._unload()

In [3]:
# instantiate the WordNet lemmatizer class defined in the previous cell;
# `wnl.lemmatize(...)` is called by the evaluation loop further down
wnl = WordNetLemmatizer()

In [4]:
# create mappings between Penn treebank and WordNet POS tags.
# A value of None means "no WordNet POS": the evaluation loop skips such tokens.
# Note that 'IN' maps to wn.ADV here but is additionally skipped explicitly in the evaluation loop.
tag_map = {
        'CC':None, # coordin. conjunction (and, but, or)
        'CD':wn.NOUN, # cardinal number (one, two)
        'DT':None, # determiner (a, the)
        'EX':wn.ADV, # existential ‘there’ (there)
        'FW':None, # foreign word (mea culpa)
        'IN':wn.ADV, # preposition/sub-conj (of, in, by)
        'JJ':wn.ADJ, # adjective (yellow)
        'JJR':wn.ADJ, # adj., comparative (bigger)
        'JJS':wn.ADJ, # adj., superlative (wildest)
        'LS':None, # list item marker (1, 2, One)
        'MD':None, # modal (can, should)
        'NN':wn.NOUN, # noun, sing. or mass (llama)
        'NNS':wn.NOUN, # noun, plural (llamas)
        'NNP':wn.NOUN, # proper noun, sing. (IBM)
        'NNPS':wn.NOUN, # proper noun, plural (Carolinas)
        'PDT':wn.ADJ, # predeterminer (all, both)
        'POS':None, # possessive ending (’s )
        'PRP':None, # personal pronoun (I, you, he)
        'PRP$':None, # possessive pronoun (your, one’s)
        'RB':wn.ADV, # adverb (quickly, never)
        'RBR':wn.ADV, # adverb, comparative (faster)
        'RBS':wn.ADV, # adverb, superlative (fastest)
        'RP':wn.ADJ, # particle (up, off)
        'SYM':None, # symbol (+,%, &)
        'TO':None, # “to” (to)
        'UH':None, # interjection (ah, oops)
        'VB':wn.VERB, # verb base form (eat)
        'VBD':wn.VERB, # verb past tense (ate)
        'VBG':wn.VERB, # verb gerund (eating)
        'VBN':wn.VERB, # verb past participle (eaten)
        'VBP':wn.VERB, # verb non-3sg pres (eat)
        'VBZ':wn.VERB, # verb 3sg pres (eats)
        'WDT':None, # wh-determiner (which, that)
        'WP':None, # wh-pronoun (what, who)
        'WP$':None, # possessive (wh- whose)
        'WRB':None, # wh-adverb (how, where)
        '$':None, #  dollar sign ($)
        '#':None, # pound sign (#)
        '“':None, # left quote (‘ or “) - typographic key kept for backward compatibility
        '”':None, # right quote (’ or ”) - typographic key kept for backward compatibility
        '(':None, # left parenthesis ([, (, {, <)
        ')':None, # right parenthesis (], ), }, >)
        ',':None, # comma (,)
        '.':None, # sentence-final punc (. ! ?)
        ':':None, # mid-sentence punc (: ; ... – -)
        "''":None, # closing quote as spelled in the Penn tagset (previously noted as "edge case")
        '``':None, # opening quote as spelled in the Penn tagset (counterpart of "''")
        '--':None  # dash tag of the Penn tagset
    }

In [57]:
# Read the full note-events table; the three listed columns get explicit dtypes.
charts = pd.read_csv(
    "NOTEEVENTS.csv",
    dtype={
        "ROW_ID":"int64",
        "CHARTTIME":"str",
        "STORETIME":"str",
    },
)

# Only the free-text column is needed downstream (kept as a one-column DataFrame).
notes_df = charts[['TEXT']]

#### Complete initial preprocessing and append tokens to list

In [86]:
# Tokenise a window of notes: rows base .. base+limit-1 of the TEXT column.
# Each note is split on newline and space characters; a token is kept only if it is
# non-empty, contains no digit, and contains no non-word character (regex \W).
# Result: `notes` is a list with one token list per note.
limit = 100
base = 1000
notes = []
for note in notes_df['TEXT'][base:base+limit]:
    kept_tokens = []
    for token in re.split(r'[\n ]', note):
        if token == '':
            continue
        if re.search(r'\d', token) or re.search(r'\W', token):
            continue
        kept_tokens.append(token)
    notes.append(kept_tokens)

#### Load the model generated by `train.ipynb` and build the list of rules

In [87]:
# Unpickle the model object stored in model.txt.
# NOTE(review): pickle.load can execute arbitrary code from the file - only load a model file you generated yourself.
with open("model.txt", "rb") as in_file:
    model = pickle.load(in_file)

# Each model entry becomes one flat rule: the '/'-separated parts of entry[0]
# followed by the first element of entry[1]. The evaluation loop reads these as
# rule[0] (ending to match), rule[1] (POS) and rule[2] (replacement ending).
rules = [entry[0].split('/') + [entry[1][0]] for entry in model]

In [88]:
#rules

#### Evaluate model performance by tracking different counts (used later to calculate performance metrics - lemmatized percentage, accuracy of lemmatizer, etc.)

In [89]:
# Counters for the performance metrics computed later.
wn_unchanged = 0   # word has WordNet synsets and wnl.lemmatize returned it unchanged
wn_changed = 0     # word has WordNet synsets and wnl.lemmatize returned a different form
wn_unfound = 0     # word has no WordNet synsets -> handed to the learned rules
my_changed = 0     # ... and one of the rules matched
my_unchanged = 0   # ... and no rule matched
lemmatized_list = []   # one [original, pos, lemma] entry per rule-based lemmatization

for item in notes:
    tagged = nltk.pos_tag(item)
    for tup in tagged:
        # .get() instead of [] so that a tag missing from tag_map is skipped
        # like any other tag without a WordNet POS, rather than raising KeyError
        pos = tag_map.get(tup[1])
        # preprocessing: skip all-caps terms, one-char tokens, terms w/o WordNet POS tag, and prepositions ("of", "with", ...)
        if len(tup[0])==1 or tup[0].isupper() or pos is None or tup[1] == "IN":
            continue
        orig = tup[0].lower()    # all add'n terms, make universally lowercase
        # differentiate b/t words FOUND and NOT FOUND by WordNet lemmatization tool:
        if wn.synsets(orig):
            if wnl.lemmatize(orig, pos=pos)==orig:
                wn_unchanged += 1
            else:
                wn_changed += 1
        else:
            wn_unfound += 1
            # since "model" is sorted in "train.ipynb", rules should be checked in their given order
            found_rule = False
            for rule in rules:
                suffix = rule[0]
                if orig.endswith(suffix) and pos==rule[1]:
                    # slice with an explicit end index: orig[:-len(suffix)] would
                    # wrongly yield '' for an empty suffix because [:-0] == [:0]
                    lem = orig[:len(orig)-len(suffix)]+rule[2]
                    #print('lemmatizing '+orig+' to: '+lem)
                    my_changed += 1
                    lemmatized_list.append([orig, pos, lem])
                    found_rule = True
                    break   # to prevent trying to lemmatize an already-lemmatized token
            if not found_rule:
                #print('no rule to lemmatize '+orig)
                my_unchanged += 1
print(my_changed, my_unchanged)

1734 2048


In [90]:
#lemmatized_list

#### Check whether the lemmatized words were lemmatized correctly by comparing them to the "gold standard" (created by hand using training data from `generate.ipynb`)

In [91]:
# first, open file and format lemma mappings.
# Each usable line is parsed as  (word, pos): lemma  with optional quotes,
# surrounding whitespace and a trailing comma.
# NOTE(review): the exact layout of lemma_mappings.txt is not visible here - confirm against the file.
with open("lemma_mappings.txt") as file:
    raw_lines = file.readlines()

items = []   # parsed entries: [[word, pos], lemma]
lms = {}     # lookup table: "word/pos" -> lemma
for raw_line in raw_lines:
    entry = raw_line.strip().rstrip(',')
    key_text, separator, lemma_text = entry.partition(':')
    if not separator:
        # blank lines and lines without a "key: value" pair (these previously
        # raised IndexError when the missing value was accessed)
        continue
    key_parts = key_text.replace('\'','').strip().lstrip('(').rstrip(')').split(', ')
    # strip the whitespace that follows ':' so stored lemmas compare equal to
    # generated lemmas
    lemma = lemma_text.replace('\'','').strip()
    items.append([key_parts, lemma])
    lms[key_parts[0]+'/'+key_parts[1]] = lemma
  

In [92]:
# Report every rule-lemmatized word whose "word/pos" key is present in the gold standard.
# (Only membership is reported here; the lemma itself is not compared in this cell.)
for entry in lemmatized_list:
    gold_key = entry[0]+'/'+entry[1]
    if gold_key in lms:
        print('found '+gold_key)
    

found intracardiac/a
found intracardiac/a
found reformatted/v
found drainable/a
found drainable/a
found neosinephrine/n
found infrarenal/a
found reformatted/v
found arousable/a
found reaccumulation/n
found reaccumulation/n
found tacrolimus/n
found enterobacter/n
found arousable/a
found arousable/a
found drainable/a
found reaccumulation/n
found arousable/a
