In [8]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import spacy
from spacy.cli.train import train

In [10]:
# Train the NER pipeline described by config/config_bert.cfg and write the
# result to ./sig_ner_bert_model.
#
# NOTE: this config needs the `transformer` factory, which is provided by the
# `spacy-transformers` package. If that package is not installed in the
# kernel's environment, spaCy raises E002 ("Can't find factory for
# 'transformer'").
#
# The corpora are located relative to the current user's home directory
# instead of a hard-coded /Users/<name>/Downloads path, so the cell is not tied
# to one machine. Point DATA_DIR elsewhere if the .spacy files live somewhere
# other than ~/Downloads.
DATA_DIR = Path.home() / "Downloads"
MAX_EPOCHS = 72  # upper bound on training epochs passed as a config override

train("config/config_bert.cfg",
      output_path='sig_ner_bert_model',
      overrides={"paths.train": str(DATA_DIR / "train_docs.spacy"),
                 "paths.dev": str(DATA_DIR / "test_docs.spacy"),
                 "training.max_epochs": MAX_EPOCHS})

[38;5;4mℹ Saving to output directory: sig_ner_bert_model[0m
[38;5;4mℹ Using CPU[0m
[1m


ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer

In [11]:
from spacy.lang.en import English

# Minimal example: a blank English pipeline with BioClinicalBERT plugged in as
# its tok2vec model. The architecture used here is Tok2VecTransformer, which
# combines Transformer + Listener; something different will be used in training.
# The architecture and span getter are looked up under `spacy-transformers.*`
# names, so that package has to be installed for this cell to run.

# Span-getter settings (these have to do with alignment).
span_getter = {
    "@span_getters": "spacy-transformers.strided_spans.v1",
    "stride": 96,
    "window": 128,
}

# Custom model config pointing at the BioClinicalBERT weights.
transformer_model = {
    "@architectures": "spacy-transformers.Tok2VecTransformer.v3",
    "name": "emilyalsentzer/Bio_ClinicalBERT",
    "tokenizer_config": {"use_fast": True},
    "get_spans": span_getter,
    "pooling": {"@layers": "reduce_mean.v1"},
}
config = {"model": transformer_model}

en = English()
trf = en.add_pipe("tok2vec", config=config)
# Pipeline components need to be initialized before use.
en.initialize()

# Two example phrases that put the same word in two different contexts.
ex1 = 'Flintstones vitamins'
ex2 = 'Flintstones cartoon'

RegistryError: [E893] Could not find function 'spacy-transformers.Tok2VecTransformer.v3' in function registry 'architectures'. If you're using a custom function, make sure the code is available. If the function is provided by a third-party package, e.g. spacy-transformers, make sure the package is installed in your environment.

Available names: spacy-legacy.CharacterEmbed.v1, spacy-legacy.EntityLinker.v1, spacy-legacy.HashEmbedCNN.v1, spacy-legacy.MaxoutWindowEncoder.v1, spacy-legacy.MishWindowEncoder.v1, spacy-legacy.MultiHashEmbed.v1, spacy-legacy.Tagger.v1, spacy-legacy.TextCatBOW.v1, spacy-legacy.TextCatCNN.v1, spacy-legacy.TextCatEnsemble.v1, spacy-legacy.Tok2Vec.v1, spacy-legacy.TransitionBasedParser.v1, spacy.CharacterEmbed.v2, spacy.EntityLinker.v2, spacy.HashEmbedCNN.v2, spacy.MaxoutWindowEncoder.v2, spacy.MishWindowEncoder.v2, spacy.MultiHashEmbed.v2, spacy.PretrainCharacters.v1, spacy.PretrainVectors.v1, spacy.SpanCategorizer.v1, spacy.Tagger.v2, spacy.TextCatBOW.v2, spacy.TextCatCNN.v2, spacy.TextCatEnsemble.v2, spacy.TextCatLowData.v1, spacy.Tok2Vec.v2, spacy.Tok2VecListener.v1, spacy.TorchBiLSTMEncoder.v1, spacy.TransitionBasedParser.v2

In [8]:
# Load the saved pipeline from the `model-best` directory under ./sig_ner_model.
# Note this is a different directory from the `sig_ner_bert_model` output path
# used by the BERT training cell.
model_path = './sig_ner_model/model-best'
trained_nlp = spacy.load(model_path)

##### first outcome! 

In [9]:
# Mixed-case instruction with unrelated text in the middle; it is lower-cased
# before being handed to the model.
inp = 'TAKE 1 TABLET  (150 MCG TOTAL) BY MOUTH two times DAILY for one week by the way patient id is 303023 and take 2 aderol every day'.lower()
doc = trained_nlp(inp)
# Show every recognized entity span together with its label.
[(ent, ent.label_) for ent in doc.ents]

[(1, 'Dosage'),
 (tablet, 'Form'),
 (150 mcg, 'Strength'),
 (two times daily, 'Frequency'),
 (aderol, 'Drug'),
 (every day, 'Frequency')]

In [151]:
# Inhaler instruction followed by unrelated trailing text.
inp = 'inhale 3 puffs of albuterol 2 times a day for one week patient id is 20202'
doc = trained_nlp(inp)
# Show every recognized entity span together with its label.
[(ent, ent.label_) for ent in doc.ents]

[(3, 'Dosage'), (puffs, 'Form'), (albuterol, 'Drug'), (2 times a day, 'Frequency'), (for one week, 'Duration')]


 We can even parse multiple dosing instructions

In [171]:
# Two consecutive dosing instructions joined into a single input string.
inp = (
    'inhale 3 puffs of albuterol 2 times a day for one week'
    ' then 1 puff of albuterol every day for 2 months'
)
doc = trained_nlp(inp)

# Print every recognized entity span together with its label.
print([(ent, ent.label_) for ent in doc.ents])

[(3, 'Dosage'), (puffs, 'Form'), (albuterol, 'Drug'), (2 times a day, 'Frequency'), (for one week, 'Duration'), (1, 'Dosage'), (puff, 'Form'), (albuterol, 'Drug'), (every day, 'Frequency'), (for 2 months, 'Duration')]


In [10]:
# Upper-case instruction with a multi-month interval; lower-cased before parsing.
inp = 'TAKE 1 TABLET (20 MG) BY MOUTH every 2 months'.lower()
doc = trained_nlp(inp)
# Print every recognized entity span together with its label.
print([(ent, ent.label_) for ent in doc.ents])

[(1, 'Dosage'), (tablet, 'Form'), (20 mg, 'Strength'), (every 2 months, 'Frequency')]


 There are many things to improve. One example is the frequency, which can and should be parsed into an interval type and amount (e.g. type=day, amount=2).

Also, the model only recognizes features it has already encountered, so a new, unseen sentence given to the model will not be parsed correctly. 
Once we use the pre-trained BERT model, this issue should be resolved.

In [11]:
# A new sentence, passed to the model without lower-casing.
inp = 'Take 1 tablet of ibuprofen 3 times a day'
doc = trained_nlp(inp)

# Show every recognized entity span together with its label.
[(ent, ent.label_) for ent in doc.ents]

[(1, 'Dosage'),
 (tablet, 'Form'),
 (ibuprofen, 'Drug'),
 (3 times a day, 'Dosage')]

In [179]:
# Instruction that leads with the drug name and strength; lower-cased before parsing.
inp = ' ASPIRIN 100 mg Initiate Medication IMMEDIATE RELEASE TABLET 1.5 daily'.lower()
doc = trained_nlp(inp)

# Show every recognized entity span together with its label.
[(ent, ent.label_) for ent in doc.ents]

[(aspirin, 'Drug'),
 (100 mg, 'Strength'),
 (tablet, 'Form'),
 (1.5, 'Dosage'),
 (daily, 'Frequency')]