https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR_TUTORIAL_1_TAGGING.md

https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR_CORPORA.md

In [None]:
!pip install flair --ignore-installed

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz

In [None]:
#replace data.py in site-packages/flair with data.py from https://github.com/flairNLP/flair/blob/master/flair/data.py

In [None]:
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

In [None]:
# Build the example sentence; tokenization is delegated to SciSpaCy.
sentence = Sentence(
    "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome",
    use_tokenizer=SciSpacyTokenizer(),
)

# Load the pre-trained biomedical tagger registered under the name "hunflair".
tagger = MultiTagger.load("hunflair")

# Run the tagger on the sentence (the return value is not used).
tagger.predict(sentence)

In [None]:
# Walk over every span annotated on the sentence (no tag type filter is given)
# and print it.
for span in sentence.get_spans():
    print(span)

In [None]:
from flair.data import Sentence

# Re-create the example sentence, this time without passing a custom tokenizer.
sentence = Sentence(
    "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome"
)

In [None]:
# Run the already-loaded tagger over the current sentence.
tagger.predict(sentence)

# Render the sentence together with its predicted tags, then show it.
tagged_text = sentence.to_tagged_string()
print(tagged_text)

In [None]:
# Only request the spans stored under the "hunflair-disease" tag type.
disease_spans = sentence.get_spans("hunflair-disease")
for disease in disease_spans:
    print(disease)

In [None]:
# Convert the sentence to a dictionary for the "hunflair-disease" tag type and print it.
disease_dict = sentence.to_dict("hunflair-disease")
print(disease_dict)

In [None]:
# Print every span currently annotated on the sentence, whatever its tag type.
for span in sentence.get_spans():
    print(span)

In [None]:
from flair.tokenization import SciSpacyTokenizer

# Same example text as before, but tokenized with SciSpaCy instead of the default tokenizer.
sentence = Sentence(
    "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome",
    use_tokenizer=SciSpacyTokenizer(),
)

In [None]:
# Multi-sentence example abstract, assembled from adjacent string literals so
# that each source line stays short; the result is one single-line string.
abstract = (
    "Fragile X syndrome (FXS) is a developmental disorder caused by a mutation in the X-linked FMR1 gene, "
    "coding for the FMRP protein which is largely involved in synaptic function. FXS patients present several "
    "behavioral abnormalities, including hyperactivity, anxiety, sensory hyper-responsiveness, and cognitive "
    "deficits. Autistic symptoms, e.g., altered social interaction and communication, are also often observed: "
    "FXS is indeed the most common monogenic cause of autism."
)

In [None]:
from flair.tokenization import SciSpacySentenceSplitter

# Create the SciSpaCy-based splitter used to break raw text into sentences.
splitter = SciSpacySentenceSplitter()

# Split the abstract; the result is used below as a list of Sentence objects.
sentences = splitter.split(abstract)

# The HunFlair tagger is given the whole list, so all sentences are tagged in one call.
tagger.predict(sentences)

In [None]:
# Print the tagged form of each sentence of the abstract, one per line.
# NOTE: the loop variable rebinds the notebook-level name `sentence`.
for sentence in sentences:
    print(sentence.to_tagged_string())

Here is example code for a biomedical NER model trained on the NCBI_DISEASE corpus, using word embeddings and Flair embeddings trained on biomedical abstracts from PubMed and full texts from PMC.

In [None]:
from flair.datasets import NCBI_DISEASE

In [None]:
# 1. Instantiate the NCBI_DISEASE corpus and print it to inspect what was loaded.
corpus = NCBI_DISEASE()
print(corpus)

In [None]:
# 2. Build the tag dictionary for the "ner" tag type from the corpus;
#    it is passed to the SequenceTagger constructor later in the notebook.
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

In [None]:
# 3. Set up the embeddings that feed the sequence tagger.
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# One word-level embedding plus a forward and a backward Flair embedding,
# all loaded from the "pubmed" family (trained on PubMed and PMC).
embedding_types = [
    WordEmbeddings("pubmed"),
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
]

# Combine the individual embeddings into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# 4. Create the sequence tagger that will be trained on the corpus.
from flair.models import SequenceTagger

# The tagger consumes the stacked embeddings and predicts labels from the
# "ner" tag dictionary built from the corpus; a CRF layer is enabled.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    hidden_size=256,
    locked_dropout=0.5,
    use_crf=True,
)

In [None]:
# 5. Train the tagger on the corpus.
from flair.trainers import ModelTrainer

# The trainer pairs the model with the corpus it is trained on.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Training artifacts are written below "taggers/ncbi-disease"; a later cell
# loads "best-model.pt" from that directory.
trainer.train(
    base_path="taggers/ncbi-disease",
    max_epochs=200,
    learning_rate=0.1,
    mini_batch_size=32,
    train_with_dev=False,
)

In [None]:
# 5 (repeated). Train the tagger on the corpus.
# NOTE(review): this cell repeats the preceding training cell verbatim. Running
# both trains the same `tagger` object twice and writes to the same base_path
# — confirm whether the repetition is intentional.
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train(
    base_path="taggers/ncbi-disease",
    max_epochs=200,
    learning_rate=0.1,
    mini_batch_size=32,
    train_with_dev=False,
)

In [None]:
from flair.data import Sentence

# Restore the model checkpoint written by the training run.
model = SequenceTagger.load("taggers/ncbi-disease/best-model.pt")

# Example input for the trained disease tagger.
sentence = Sentence(
    "Women who smoke 20 cigarettes a day are four times more likely to develop breast cancer."
)

# Tag the sentence, then print it with the predicted tags.
model.predict(sentence)

tagged_text = sentence.to_tagged_string()
print(tagged_text)

In [None]:
# A longer clinical sentence; the literal is split across lines purely for
# readability and concatenates to one unchanged string.
sentence = Sentence(
    "High-risk children should be treated with an intravenous antipseudomonal beta lactam agent, "
    "unless there is suspicion of multi-drug resistance when an antibiotic combination should be used, "
    "in unresponsive cases, more invasive procedures, including bronchoalveolar lavage (BAL), "
    "computed tomography (CT)-guided fine-needle aspiration or open lung biopsy (OLB), "
    "are recommended."
)

# Tag the sentence with the trained model, then print it with the predicted tags.
model.predict(sentence)

tagged_text = sentence.to_tagged_string()
print(tagged_text)

# Fine-tuning HunFlair models

In the following example, we fine-tune the hunflair-disease model on the NCBI_DISEASE corpus:

In [None]:
from flair.datasets import NCBI_DISEASE
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 1. Target corpus the pre-trained model is fine-tuned on.
corpus = NCBI_DISEASE()

# 2. Start from the pre-trained "hunflair-disease" tagger instead of a fresh model.
tagger: SequenceTagger = SequenceTagger.load("hunflair-disease")

# 3. Pair the loaded tagger with the target corpus.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 4. Fine-tune; outputs go to a directory separate from the other training runs.
trainer.train(
    base_path="taggers/hunflair-disease-finetuned-ncbi",
    max_epochs=200,
    learning_rate=0.1,
    mini_batch_size=32,
    train_with_dev=False,
)

# Training HunFlair from scratch

In [None]:
from flair.datasets import HUNER_CELL_LINE
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 1. Get all corpora for a specific entity type (here: cell lines).
corpus = HUNER_CELL_LINE()

# 2. Stack the "pubmed" word embedding with the forward and backward
#    "pubmed" Flair embeddings.
embedding_types = [
    WordEmbeddings("pubmed"),
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. Build the "ner" tag dictionary from the corpus and create the tagger.
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    hidden_size=256,
    locked_dropout=0.5,
    use_crf=True,
)

# 4. Train the model; outputs are written below "taggers/hunflair-cell-line".
trainer = ModelTrainer(tagger, corpus)

trainer.train(
    base_path="taggers/hunflair-cell-line",
    max_epochs=200,
    learning_rate=0.1,
    mini_batch_size=32,
    train_with_dev=False,
)

In [None]:
# Analogously, distinct models can be trained for chemicals, diseases, genes/proteins and species using HUNER_CHEMICAL, HUNER_DISEASE, HUNER_GENE and HUNER_SPECIES, respectively.