# mapping custom tags 

In [2]:
import pandas as pd
import numpy as np

In [3]:
import plac
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


In [4]:
def load_drug_entities():
    drugs = np.load("/Users/sdeshpande/Desktop/bioinformatices/Clinical_NER_train/drug_entities.npy")
    drugs = [d.lower() for d in drugs if (len(d)>4)]
    non_ents = ["solution"]
    for ent in non_ents:
        drugs.remove(ent)
    drugs = list(set(drugs))
    return drugs

In [5]:
drugs = load_drug_entities()

In [6]:
drugs[0:20]

['ferrous sulfate',
 'cosyntropin',
 'cipro hc otic suspension',
 'flovent',
 'fluphenazine decanoate',
 'cinacalcet',
 'golytely',
 'critic-aid clear',
 '*nf* praziquantel',
 'buprenorphine-naloxone (8mg-2mg)',
 'jolessa',
 'propofol',
 'phenobarbital',
 '*nf* interferon beta-1a',
 'insulin lispro 75/25',
 'methylprednisolone na succ',
 'neomycin-polymyxin-',
 '*nf* glycopyrrolate',
 'terconazole 80mg vag. supp',
 'maribavir 100mg or placebo (*ind*)']

In [7]:
def load_drug_entities():
    drugs = np.load("/Users/sdeshpande/Desktop/bioinformatices/Clinical_NER_train/drug_entities.npy")
    drugs = [d.lower() for d in drugs if (len(d)>4)]
    non_ents = ["solution"]
    for ent in non_ents:
        drugs.remove(ent)
    drugs = list(set(drugs))
    return drugs

In [8]:
def load_dose_entities():
    doses = np.load("/Users/sdeshpande/Desktop/bioinformatices/Clinical_NER_train/dose_entities.npy")
    doses = [d.lower() for d in doses if len(d)>1]
    return list(set(doses))

In [9]:
dose_ents = load_dose_entities()
print(dose_ents[:10])

['5ml oral syr', '15 mg/5 ml oral soln', '120mg suppository', '1 mg vial', '2 g / 100 ml premix bag', '10 meq (1080 mg) tablet', '20 mg/ml oral concentrate', '100mg/10ml vial', '150-30-6-150 mg-unit-mg-mg', '1mg oral syr.']


In [10]:
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [11]:
import spacy

In [12]:
nlp = spacy.load('en_core_web_md')

In [13]:
class EntityMatcher(object):
    name = 'entity_matcher'

    def __init__(self, nlp, drugs, label):
        patterns = [nlp(text) for text in drugs]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        seen_tokens = set()
        new_entities = []
        entities = doc.ents
        for match_id, start, end in matches:
        #    span = Span(doc, start, end, label=match_id)
        #    doc.ents = list(doc.ents) + [span]
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))

        doc.ents = tuple(entities) + tuple(new_entities)
        return doc

In [14]:
class EntityMatcherDose(object):
    name = 'entity_matcher_dose'

    def __init__(self, nlp, dose_ents, label):
        patterns = [nlp(text) for text in dose_ents]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        seen_tokens = set()
        new_entities = []
        entities = doc.ents
        for match_id, start, end in matches:
        #    span = Span(doc, start, end, label=match_id)
        #    doc.ents = list(doc.ents) + [span]
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))

        doc.ents = tuple(entities) + tuple(new_entities)
        return doc

In [15]:
from spacy.matcher import Matcher

In [16]:
entity_matcher_3 = Matcher(nlp.vocab)
pattern = [{"TEXT": {"REGEX": "[0-9]+"}}]
entity_matcher_3.add("MEASUREMENTS", None, pattern)

In [17]:
class EntityMatcherNumber(object):
    name = 'entity_matcher_number'

    def __init__(self, nlp, label):
        patterns = [{"TEXT": {"REGEX": "[0-9]+"}}]
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(label, None, patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        seen_tokens = set()
        new_entities = []
        entities = doc.ents
        for match_id, start, end in matches:
        #    span = Span(doc, start, end, label=match_id)
        #    doc.ents = list(doc.ents) + [span]
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))

        doc.ents = tuple(entities) + tuple(new_entities)
        return doc

In [18]:
entity_matcher_3 = EntityMatcherNumber(nlp, 'MEASUREMENTS')

In [19]:
### Regex pattern matching reference code for spacy

# nlp = spacy.load("en_core_web_lg")
# matcher = Matcher(nlp.vocab)
# pattern = [{"TEXT": {"REGEX": "^[Uu](\.?|nited)$"}},
#            {"TEXT": {"REGEX": "^[Ss](\.?|tates)$"}}]
# matcher.add("US", None, pattern)
# doc = nlp(u"I'm from the United States.")
# matches = matcher(doc)
# for match_id, start, end in matches:
#     string_id = nlp.vocab.strings[match_id]
#     span = doc[start:end]
#     print(match_id, string_id, start, end, span.text)

In [20]:
entity_matcher = EntityMatcher(nlp, drugs, 'DRUGS')
entity_matcher_2 = EntityMatcherDose(nlp, dose_ents, 'DOSES')

In [21]:
nlp.add_pipe(entity_matcher)
nlp.add_pipe(entity_matcher_2)
print(nlp.pipe_names)  # the components in the pipeline

['tagger', 'parser', 'ner', 'entity_matcher', 'entity_matcher_dose']


In [22]:
nlp.add_pipe(entity_matcher_3)
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'entity_matcher', 'entity_matcher_dose', 'entity_matcher_number']


In [29]:
doc = nlp(u"guanfacine, capecitabine, ertapenem sodium, insulin lispro, isosorbide mononitrate , nexium, aprotinin, metronidazole (flagyl), critic-aid, 0.4mg/2ml vial, 0.4mg/hr patch (10mg), 50 mg capsule. I have a cat")
print([(ent.text, ent.label_) for ent in doc.ents])

[('guanfacine', 'DRUGS'), ('ertapenem', 'DRUGS'), ('sodium', 'DRUGS'), ('insulin', 'DRUGS'), ('isosorbide', 'DRUGS'), ('nexium', 'DRUGS'), ('aprotinin', 'DRUGS'), ('metronidazole', 'DRUGS'), ('critic-aid', 'DRUGS'), ('0.4mg/2ml', 'MEASUREMENTS'), ('0.4mg', 'MEASUREMENTS'), ('10', 'MEASUREMENTS'), ('50', 'MEASUREMENTS'), ('mg', 'DOSES'), ('capsule', 'DOSES')]


In [30]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

guanfacine 0 10 DRUGS
ertapenem 26 35 DRUGS
sodium 36 42 DRUGS
insulin 44 51 DRUGS
isosorbide 60 70 DRUGS
nexium 85 91 DRUGS
aprotinin 93 102 DRUGS
metronidazole 104 117 DRUGS
critic-aid 128 138 DRUGS
0.4mg/2ml 140 149 MEASUREMENTS
0.4mg 156 161 MEASUREMENTS
10 172 174 MEASUREMENTS
50 179 181 MEASUREMENTS
mg 182 184 DOSES
capsule 185 192 DOSES


In [36]:
df = pd.DataFrame([(e.text, e.ent_iob_, e.ent_type_) for e in doc])
df.columns = ["word","BIO","tag"]
df.head()

Unnamed: 0,word,BIO,tag
0,guanfacine,B,DRUGS
1,",",O,
2,capecitabine,O,
3,",",O,
4,ertapenem,B,DRUGS


In [37]:
sentences = []
for sent_i, sent in enumerate(doc.sents):
    for token in sent:
        sentences.append((sent_i, token.text))
        
df2 = pd.DataFrame(sentences)
df2.columns = ["sentence_number","word"]
df2.head()

Unnamed: 0,sentence_number,word
0,0,guanfacine
1,0,","
2,0,capecitabine
3,0,","
4,0,ertapenem


In [40]:
merged_df = pd.merge(df, df2, on='word')
print(merged_df)

           word BIO    tag  sentence_number
0    guanfacine   B  DRUGS                0
1             ,   O                       0
2             ,   O                       0
3             ,   O                       0
4             ,   O                       0
..          ...  ..    ...              ...
157           .   O                       0
158           I   O                       1
159        have   O                       1
160           a   O                       1
161         cat   O                       1

[162 rows x 4 columns]
