In [4]:
import spacy

In [5]:
# Load the "en_core_sci_sm" pipeline through spaCy.
# NOTE(review): `nlp` is not referenced by any other cell visible here; presumably this
# only confirms the model is installed before the corpus is loaded — confirm or remove.
nlp = spacy.load("en_core_sci_sm")

In [6]:
from flair.datasets import NCBI_DISEASE

# 1. load the NCBI disease corpus (the log below shows train/dev/test CoNLL files being read)
corpus = NCBI_DISEASE()

2021-02-17 06:54:26,247 Reading data from /root/.flair/datasets/ncbi_disease
2021-02-17 06:54:26,247 Train: /root/.flair/datasets/ncbi_disease/SciSpacySentenceSplitter_core_sci_sm_0.4.0_SciSpacyTokenizer_core_sci_sm_0.4.0_train.conll
2021-02-17 06:54:26,248 Dev: /root/.flair/datasets/ncbi_disease/SciSpacySentenceSplitter_core_sci_sm_0.4.0_SciSpacyTokenizer_core_sci_sm_0.4.0_dev.conll
2021-02-17 06:54:26,248 Test: /root/.flair/datasets/ncbi_disease/SciSpacySentenceSplitter_core_sci_sm_0.4.0_SciSpacyTokenizer_core_sci_sm_0.4.0_test.conll


In [7]:
# 2. make the tag dictionary from the corpus
# Only the "ner" tag type is requested; the result is later passed to
# SequenceTagger as its `tag_dictionary` argument.
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

In [8]:
# 3. initialize embeddings
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, WordEmbeddings

# Components in stacking order:
#   - word embeddings trained on PubMed and PMC
#   - forward and backward flair embeddings trained on PubMed and PMC
embedding_types = [
    WordEmbeddings("pubmed"),
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
]

# Combine the three components into the single embedding object the tagger consumes.
embeddings: StackedEmbeddings = StackedEmbeddings(embedding_types)

2021-02-17 06:55:04,642 https://flair.informatik.hu-berlin.de/resources/embeddings/token/pubmed_pmc_wiki_sg_1M.gensim.vectors.npy not found in cache, downloading to /tmp/tmpv66t2bpk


100%|██████████| 800000128/800000128 [01:12<00:00, 10980980.60B/s]

2021-02-17 06:56:17,646 copying /tmp/tmpv66t2bpk to cache at /root/.flair/embeddings/pubmed_pmc_wiki_sg_1M.gensim.vectors.npy





2021-02-17 06:56:18,068 removing temp file /tmp/tmpv66t2bpk
2021-02-17 06:56:18,246 https://flair.informatik.hu-berlin.de/resources/embeddings/token/pubmed_pmc_wiki_sg_1M.gensim not found in cache, downloading to /tmp/tmpdenw2jgi


100%|██████████| 53979687/53979687 [00:05<00:00, 9300247.57B/s] 

2021-02-17 06:56:24,202 copying /tmp/tmpdenw2jgi to cache at /root/.flair/embeddings/pubmed_pmc_wiki_sg_1M.gensim
2021-02-17 06:56:24,237 removing temp file /tmp/tmpdenw2jgi





2021-02-17 06:56:27,152 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/pubmed-forward.pt not found in cache, downloading to /tmp/tmpl5iq7m5h


100%|██████████| 72819144/72819144 [00:09<00:00, 7354756.10B/s] 

2021-02-17 06:56:37,241 copying /tmp/tmpl5iq7m5h to cache at /root/.flair/embeddings/pubmed-forward.pt
2021-02-17 06:56:37,284 removing temp file /tmp/tmpl5iq7m5h





2021-02-17 06:56:37,567 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/pubmed-backward.pt not found in cache, downloading to /tmp/tmpe8_tubgt


100%|██████████| 72819144/72819144 [00:04<00:00, 15899180.53B/s]

2021-02-17 06:56:42,276 copying /tmp/tmpe8_tubgt to cache at /root/.flair/embeddings/pubmed-backward.pt
2021-02-17 06:56:42,320 removing temp file /tmp/tmpe8_tubgt





In [9]:
# 4. initialize sequence tagger
from flair.models import SequenceTagger

# "ner" tagger built on the stacked embeddings and the corpus tag dictionary,
# with CRF enabled, hidden size 256 and locked dropout 0.5.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    hidden_size=256,
    locked_dropout=0.5,
    use_crf=True,
)

In [10]:
# 5. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Run training; artifacts are written under "taggers/ncbi-disease".
# The call is kept as the cell's last expression so the dict it returns
# (test score, dev score history, ...) is shown as the cell output.
trainer.train(
    base_path="taggers/ncbi-disease",
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=200,
    train_with_dev=False,  # dev split is not added to the training data
)

2021-02-17 06:57:43,542 ----------------------------------------------------------------------------------------------------
2021-02-17 06:57:43,543 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('pubmed')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4296, out_features=4296, bias=True)
  (rnn): LSTM(4296, 256, batch_first=True, bi

{'test_score': 0.8695652173913043,
 'dev_score_history': [0.7875000000000001,
  0.8057553956834533,
  0.8369426751592357,
  0.8364576599870719,
  0.852733118971061,
  0.854074542008844,
  0.8353464717101081,
  0.8511716276124129,
  0.8602564102564102,
  0.8692356285533797,
  0.8547979797979799,
  0.8443305573350417,
  0.8726355611601513,
  0.8820960698689956,
  0.8488372093023254,
  0.8782051282051283,
  0.8702570379436965,
  0.8732042473454091,
  0.8766773162939298,
  0.8714733542319749,
  0.8714462299134734,
  0.8664987405541562,
  0.8810572687224669,
  0.8756281407035175,
  0.87375,
  0.8734335839598997,
  0.8763474952441344,
  0.8765743073047858,
  0.8737373737373738,
  0.871374527112232,
  0.8735053492762744,
  0.8724747474747474,
  0.8768161718256475,
  0.8761785040854808,
  0.8785399622404028,
  0.8769716088328076,
  0.8779874213836477,
  0.8744479495268138,
  0.8769716088328076,
  0.8765743073047858,
  0.8771266540642723,
  0.8771266540642723,
  0.8771266540642723,
  0.87712665