In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import pandas as pd
import numpy as np
import scipy as sp
import datetime
import matplotlib.pyplot as plt
import math
import re
import csv

# the following are NLTK and WordNet packages for the baseline method:
#from __future__ import unicode_literals
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet as wn
from nltk.compat import python_2_unicode_compatible
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
import nltk

%matplotlib inline

In [3]:
# the following is based on the source code provided for WordNet Lemmatizer (see report)
class WordNetLemmatizer(object):
    """Minimal WordNet lemmatizer used for the baseline method.

    Defining this class rebinds the name ``WordNetLemmatizer`` that the
    import cell pulled in from ``nltk.stem``; later cells therefore use this
    implementation once this cell has run.
    """

    def __init__(self):
        # No state to set up: every lookup goes through the module-level `wn`.
        pass

    def lemmatize(self, word, pos=NOUN):
        """Return the shortest WordNet lemma of ``word`` for the given POS.

        Parameters
        ----------
        word : str
            Surface form to lemmatize.
        pos : str
            WordNet part-of-speech constant; defaults to ``NOUN``.

        Returns
        -------
        str
            The shortest candidate produced by ``wn._morphy``, or ``word``
            itself when no candidate is found.
        """
        lemmas = wn._morphy(word, pos)
        return min(lemmas, key=len) if lemmas else word

    def __repr__(self):
        # Was misspelled `__rept__`, so Python never used it for repr().
        return '<WordNetLemmatizer>'

    # unload wordnet
    @staticmethod
    def teardown_module(module=None):
        """Unload the WordNet corpus held by ``wn``.

        Declared static because the signature has no ``self``; without the
        decorator, calling it on an instance bound the instance to ``module``.
        The ``module`` argument is accepted but not used.
        """
        wn._unload()

In [4]:
# instantiate the lemmatizer; `wnl` is the object the token-counting cell calls
# NOTE(review): which WordNetLemmatizer this is (custom class vs. the nltk.stem
# import) depends on whether the class-definition cell has been run first
wnl = WordNetLemmatizer()

In [5]:
# create mappings between Penn treebank and WordNet POS tags:
# a value of None marks a tag that is skipped (no WordNet POS assigned to it here)
tag_map = {
    'CC': None,      # coordin. conjunction (and, but, or)
    'CD': wn.NOUN,   # cardinal number (one, two)
    'DT': None,      # determiner (a, the)
    'EX': wn.ADV,    # existential ‘there’ (there)
    'FW': None,      # foreign word (mea culpa)
    'IN': wn.ADV,    # preposition/sub-conj (of, in, by)
    'JJ': wn.ADJ,    # adjective (yellow)
    'JJR': wn.ADJ,   # adj., comparative (bigger)
    'JJS': wn.ADJ,   # adj., superlative (wildest)
    'LS': None,      # list item marker (1, 2, One)
    'MD': None,      # modal (can, should)
    'NN': wn.NOUN,   # noun, sing. or mass (llama)
    'NNS': wn.NOUN,  # noun, plural (llamas)
    'NNP': wn.NOUN,  # proper noun, sing. (IBM)
    'NNPS': wn.NOUN, # proper noun, plural (Carolinas)
    'PDT': wn.ADJ,   # predeterminer (all, both)
    'POS': None,     # possessive ending (’s )
    'PRP': None,     # personal pronoun (I, you, he)
    'PRP$': None,    # possessive pronoun (your, one’s)
    'RB': wn.ADV,    # adverb (quickly, never)
    'RBR': wn.ADV,   # adverb, comparative (faster)
    'RBS': wn.ADV,   # adverb, superlative (fastest)
    'RP': wn.ADJ,    # particle (up, off)
    'SYM': None,     # symbol (+,%, &)
    'TO': None,      # “to” (to)
    'UH': None,      # interjection (ah, oops)
    'VB': wn.VERB,   # verb base form (eat)
    'VBD': wn.VERB,  # verb past tense (ate)
    'VBG': wn.VERB,  # verb gerund (eating)
    'VBN': wn.VERB,  # verb past participle (eaten)
    'VBP': wn.VERB,  # verb non-3sg pres (eat)
    'VBZ': wn.VERB,  # verb 3sg pres (eats)
    'WDT': None,     # wh-determiner (which, that)
    'WP': None,      # wh-pronoun (what, who)
    'WP$': None,     # possessive (wh- whose)
    'WRB': None,     # wh-adverb (how, where)
    '$': None,       # dollar sign ($)
    '#': None,       # pound sign (#)
    '“': None,       # left quote (‘ or “)
    '”': None,       # right quote (’ or ”)
    '``': None,      # opening quote tag as spelled in the Penn tagset
    "''": None,      # closing quote tag as spelled in the Penn tagset
    '(': None,       # left parenthesis ([, (, {, <)
    ')': None,       # right parenthesis (], ), }, >)
    ',': None,       # comma (,)
    '.': None,       # sentence-final punc (. ! ?)
    ':': None,       # mid-sentence punc (: ; ... – -)
    '--': None,      # dash tag in the Penn tagset
}

In [9]:
# read the note events table; three columns get an explicit dtype up front
charts = pd.read_csv(
    "NOTEEVENTS.csv",
    dtype=dict(ROW_ID="int64", CHARTTIME="str", STORETIME="str"),
)

KeyboardInterrupt: 

In [7]:
# keep only the TEXT column of `charts`; the tokenising cell reads notes_df['TEXT']
notes_df = charts[['TEXT']]


In [8]:
# preprocessing: take the first `limit` rows of the TEXT column and split each one
# on newline and space characters, giving one token list per row (a list of lists)
# additional preprocessing: drop empty tokens and any token that contains a digit
# or a non-word character
limit = 1000
notes = [
    [
        token
        for token in re.split(r'[\n ]', note)
        if token != ''
        and re.search(r'\d', token) is None
        and re.search(r'\W', token) is None
    ]
    for note in notes_df['TEXT'][:limit]
]


In [102]:
# Tally how many tokens the lemmatizer leaves unchanged vs. changes, and collect
# the unchanged (token, WordNet POS) pairs for which WordNet has no synsets.
unchanged = 0
changed = 0
not_found = set()
for item in notes:
    # POS tagging runs on the tokens as they appear in the note (original case)
    tagged = nltk.pos_tag(item)
    for token, penn_tag in tagged:
        # preprocessing: skip all-caps terms (generally abbreviations or acronyms which will not have lemma mappings)
        if token.isupper():
            continue
        # preprocessing: skip all 1-letter tokens (only 'I' and 'A' are possible words, and those will have no lemmas)
        if len(token) == 1:
            continue
        # .get() so a tag missing from tag_map is skipped like a None-mapped tag
        # instead of aborting the whole run with a KeyError
        wn_pos = tag_map.get(penn_tag)
        if wn_pos is None:
            # skip words that don't have a POS tag in WordNet (conjunctions, etc.)
            continue
        # preprocessing: skip prepositions ("of", "with", "without", etc.)
        if penn_tag == "IN":
            continue
        # lowercase only now, after tagging: this affects the WordNet lookups,
        # not the POS tags assigned above
        orig = token.lower()
        if wnl.lemmatize(orig, pos=wn_pos) == orig:
            unchanged += 1
            # an unchanged token is recorded as "not found" only if WordNet has
            # no synsets for it under any part of speech
            if not wn.synsets(orig):
                not_found.add((orig, wn_pos))
        else:
            changed += 1

print('Unchanged: '+str(unchanged)+'\nChanged: '+str(changed))
print('Total number of tokens (after processing): '+str(unchanged+changed))
print('Number of (unique) tokens not found by WNL: '+str(len(not_found)))


Unchanged: 510266
Changed: 164968
Total number of tokens (after processing): 675234
Number of (unique) tokens not found by WNL: 7496


In [103]:
# write list of words not found in WordNet to csv file
# All (token, WordNet POS) pairs go on a single CSV row, one pair per field,
# each rendered with str() by the csv writer (same layout as before).
# newline='' is what the csv module requires of file objects it writes to;
# sorting makes the output identical across runs (set order is arbitrary).
with open('../undetermined_lemmas.txt', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(sorted(not_found))