In [5]:
import re
import os
import xmltodict

import spacy
import scispacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [6]:
# Load Base English model
from spacy.lang.en import English
en = English()

In [8]:
# ScispaCy model
# To install, please do:
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz

import en_core_sci_md
scispacy = en_core_sci_md.load()

Load data

In [9]:
def open_ehr(path):
    with open(path, 'r') as file:
        doc=file.read()
    doc_main = xmltodict.parse(doc)['PatientMatching']
    doc_txt = doc_main['TEXT']
    doc_tags = doc_main['TAGS']
    return doc_txt, doc_tags

In [10]:
def create_tags_dict(xml_tags):
    tag_dict = {}
    for name in xml_tags:
        tag_dict[name] = xml_tags[name]["@met"]
    return tag_dict

In [11]:
def clean_txt(xml_txt):
    new_txt=re.sub("\\n", " ", xml_txt)
    new_txt = re.sub("[\s\W]{4,}|\_+", " ", new_txt)
    return new_txt

In [12]:
train_records={}

directory = '../train/'

for filename in os.listdir(directory):
    if filename.endswith('.xml'):
        file_id = int(filename.split('.')[0])
        xml_txt, xml_tags = open_ehr(os.path.join(directory, filename))
        tags_dict = create_tags_dict(xml_tags)
        text = clean_txt(xml_txt)
        train_records[file_id] = {'tags':tags_dict,
                                   'text':text}


In [13]:
len(train_records)

202

Tokenizer Function

In [14]:
# Advanced Tokenizer

def adv_tokenizer(doc, model=en, 
                  replace_entities=False, 
                  remove_stopwords=True, 
                  lowercase=True, 
                  alpha_only=True, 
                  lemma=True):
    """Full tokenizer with flags for processing steps
    replace_entities: If True, replaces with entity type
    stop_words: If False, removes stop words
    lowercase: If True, lowercases all tokens
    alpha_only: If True, removes all non-alpha characters
    lemma: If True, lemmatizes words
    """
    parsed = model(doc)
    # token collector
    tokens = []
    # index pointer
    i = 0
    # entity collector
    ent = ''
    for t in parsed:
        # only need this if we're replacing entities
        if replace_entities:
            # replace URLs
            if t.like_url:
                tokens.append('URL')
                continue
            # if there's entities collected and current token is non-entity
            if (t.ent_iob_=='O')&(ent!=''):
                tokens.append(ent)
                ent = ''
                continue
            elif t.ent_iob_!='O':
                ent = t.ent_type_
                continue
        # only include stop words if stop words==True
        if (t.is_stop) & (remove_stopwords):
            continue
        # only include non-alpha is alpha_only==False
        if (not t.is_alpha)&(alpha_only):
            continue
        if lemma:
            t = t.lemma_
        else:
            t = t.text
        if lowercase:
            t.lower() 
        tokens.append(t)   
    return tokens

In [15]:
# Function to return the top N tokens using Count Vectorizer
def top_N_words(text, N=10, model=en, replace_entities=False, remove_stopwords=True, lowercase=True, 
                alpha_only=True, lemma=True, min_df=1, max_df=1.0, ngram=1):
    
  # running this on negative reviews
  cv = CountVectorizer(tokenizer=lambda text: adv_tokenizer(text, model=model, 
                                                            lemma=lemma, 
                                                            replace_entities=replace_entities,
                                                            lowercase=lowercase, 
                                                            remove_stopwords=remove_stopwords,
                                                            alpha_only=alpha_only), 
                       min_df=min_df, max_df=max_df, ngram_range=(ngram,ngram))
  
  cv_vectors = cv.fit_transform(text).toarray()
  # get_feature_names gets the vocabulary of the vectorizer in order
  word_count = dict(zip(cv.get_feature_names(), cv_vectors.sum(axis=0)))
  # get the top N words
  return sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:N]

## Finding Token Counts

In [68]:
# Top N words in one document
text = train_records[101]['text']

print("Top N words present:")
print(top_N_words([text], N=10, ngram=1), '\n')

Top N words present:
[('weight', 11), ('chest', 9), ('russell', 9), ('clear', 6), ('mercy', 6), ('pressure', 6), ('donna', 5), ('follow', 5), ('lose', 5), ('assessment', 4)] 



In [69]:
# Top N words in a list of documents
text_list = []
for record in train_records.values():
    text_list.append(record['text'])

print("Top N words present:")
print(top_N_words(text_list, N=10, ngram=1), '\n')

Top N words present:
[('mg', 6047), ('po', 3019), ('patient', 2707), ('history', 2553), ('qd', 2228), ('pain', 2139), ('date', 1591), ('chest', 1507), ('l', 1478), ('normal', 1439)] 



In [70]:
# Top N words in a list of documents
text_list = []
for record in train_records.values():
    text_list.append(record['text'])

print("Top N words present:")
print(top_N_words(text_list, N=10, ngram=1, replace_entities=True), '\n')

Top N words present:
[('URL', 66)] 



### Find Top Tokens for Conditions Met

In [36]:
text_by_tag = {}
for tag in train_records[162]['tags']:
    text_by_tag[tag] = []

In [38]:
# Create dictionary where key = tag; value = array of texts that meet tag criteria
for record in train_records.values():
    for tag in text_by_tag:
        if record['tags'][tag] == 'met':
            text_by_tag[tag].append(record['text'])

In [49]:
top_tokens = {}
for tag in text_by_tag:
    top_tokens[tag] = top_N_words(text_by_tag[tag], N=10, ngram=1)

Top 10 words in text with criteria for ABDOMINAL:
[('mg', 5306), ('po', 2758), ('history', 2202), ('patient', 2154), ('qd', 2048), ('pain', 1914), ('l', 1480), ('date', 1450), ('normal', 1260), ('left', 1152)] 

Top 10 words in text with criteria for ADVANCED-CAD:
[('mg', 7522), ('po', 3534), ('patient', 3234), ('history', 3186), ('pain', 2910), ('qd', 2872), ('chest', 2082), ('date', 1874), ('normal', 1794), ('l', 1758)] 

Top 10 words in text with criteria for ALCOHOL-ABUSE:
[('mg', 294), ('history', 224), ('pain', 186), ('patient', 186), ('po', 166), ('l', 148), ('normal', 146), ('dr', 112), ('qd', 112), ('blood', 94)] 

Top 10 words in text with criteria for ASP-FOR-MI:
[('mg', 10306), ('po', 5048), ('patient', 4262), ('history', 4106), ('qd', 3910), ('pain', 3634), ('chest', 2548), ('date', 2490), ('l', 2422), ('normal', 2358)] 

Top 10 words in text with criteria for CREATININE:
[('mg', 4756), ('patient', 2422), ('po', 2406), ('history', 2220), ('qd', 1690), ('pain', 1528), ('dat

In [None]:
# TODO: 
##  add frequent document words to stop words
##  histograms for all conditions
## 