# Sequences tagging using flair

## I. Data preparation

### I.1. Tokenization

In [2]:
from flair.data import Sentence

# make a sentence
#sentence = Sentence('Karim bought a Lenovo computer with over 70000 DZD when he was in Algiers, Algeria.')

sentence = Sentence("I am going to visit Dr. Watson. He is in U.K.")
sentence.tokens

[Token: 1 I,
 Token: 2 am,
 Token: 3 going,
 Token: 4 to,
 Token: 5 visit,
 Token: 6 Dr.,
 Token: 7 Watson,
 Token: 8 .,
 Token: 9 He,
 Token: 10 is,
 Token: 11 in,
 Token: 12 U.K,
 Token: 13 .]

In [3]:
# introduce a tokenized sentence
sentence2 = Sentence(['Karim', 'bought', 'a', 'Lenovo', 'computer'])

sentence2.tokens

[Token: 1 Karim,
 Token: 2 bought,
 Token: 3 a,
 Token: 4 Lenovo,
 Token: 5 computer]

In [4]:
# use another tokenizer
from flair.tokenization import JapaneseTokenizer

# init japanese tokenizer
ja_tokenizer = JapaneseTokenizer("janome")

# make sentence (and tokenize)
sentence3 = Sentence('私はESIの先生です。毎日、そこに行きます。', use_tokenizer=ja_tokenizer)

sentence3.tokens

[Token: 1 私,
 Token: 2 は,
 Token: 3 ESI,
 Token: 4 の,
 Token: 5 先生,
 Token: 6 です,
 Token: 7 。,
 Token: 8 毎日,
 Token: 9 、,
 Token: 10 そこ,
 Token: 11 に,
 Token: 12 行き,
 Token: 13 ます,
 Token: 14 。]

In [5]:
from flair.data import Sentence, Token, Tokenizer
from typing import List
import re

class ArTokenizer(Tokenizer):
    
    def __init__(self):
        super(ArTokenizer, self).__init__()
        self.punct = re.compile(r'^(.*)([،:.,,؟!])$')

    def tokenize(self, text: str) -> List[Token]:
        words = text.split()
        tokens: List[Token] = []
        for word in words:
            m = self.punct.match(word)
            if m:
                tokens.append(Token(m.group(1)))
                tokens.append(Token(m.group(2)))
            else:
                tokens.append(Token(word))
        return tokens
        
        

# init arabic tokenizer
ar_tokenizer = ArTokenizer()

ar_sentence = Sentence("أنا ذاهب إلى السوق. هل تريد أن أحضر لك شيء ما؟ هكذا إذن! نلتقي بعد أن أعود.", use_tokenizer=ar_tokenizer)

ar_sentence.tokens

[Token: 1 أنا,
 Token: 2 ذاهب,
 Token: 3 إلى,
 Token: 4 السوق,
 Token: 5 .,
 Token: 6 هل,
 Token: 7 تريد,
 Token: 8 أن,
 Token: 9 أحضر,
 Token: 10 لك,
 Token: 11 شيء,
 Token: 12 ما,
 Token: 13 ؟,
 Token: 14 هكذا,
 Token: 15 إذن,
 Token: 16 !,
 Token: 17 نلتقي,
 Token: 18 بعد,
 Token: 19 أن,
 Token: 20 أعود,
 Token: 21 .]

### I.2. Corpus preparation

In [6]:
#existing corpora in flair
import flair.datasets
ud_en_corpus = flair.datasets.UD_ENGLISH()

print('______ Access a sentence ______')
print(ud_en_corpus.test[0])

print('______ PoS-tagged sentence ______')
print(ud_en_corpus.test[0].to_tagged_string('pos'))


2021-09-30 11:52:33,577 Reading data from /home/kariminf/.flair/datasets/ud_english
2021-09-30 11:52:33,579 Train: /home/kariminf/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2021-09-30 11:52:33,581 Dev: /home/kariminf/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2021-09-30 11:52:33,583 Test: /home/kariminf/.flair/datasets/ud_english/en_ewt-ud-test.conllu
______ Access a sentence ______
Sentence: "What if Google Morphed Into GoogleOS ?"   [− Tokens: 7  − Token-Labels: "What <what/PRON/WP/root/Int> if <if/SCONJ/IN/mark> Google <Google/PROPN/NNP/nsubj/Sing> Morphed <morph/VERB/VBD/advcl/Ind/Past/Fin> Into <into/ADP/IN/case> GoogleOS <GoogleOS/PROPN/NNP/obl/Sing> ? <?/PUNCT/./punct>"]
______ PoS-tagged sentence ______
What <WP> if <IN> Google <NNP> Morphed <VBD> Into <IN> GoogleOS <NNP> ? <.>


In [7]:
#creating your own corpus
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns
columns = {0: 'text', 1: 'pos', 2: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = '.'

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='flair_train.txt',
                              test_file='flair_test.txt',
                              dev_file='flair_dev.txt')

2021-09-30 11:52:40,994 Reading data from .
2021-09-30 11:52:40,996 Train: flair_train.txt
2021-09-30 11:52:40,997 Dev: flair_dev.txt
2021-09-30 11:52:40,997 Test: flair_test.txt


In [8]:
corpus.train[0].to_tagged_string('pos')

'un <DET> ordianteur <NOUN> peut <VERB> vous <PRON> aider <VERB>'

## II. Part of Speech (PoS) tagging

### II.1. Tagging

In [9]:
from flair.models import SequenceTagger

# load the PoS tagger
pos_tagger = SequenceTagger.load('pos')

# run PoS over sentence
pos_tagger.predict(sentence)

del pos_tagger

2021-09-30 11:52:41,240 --------------------------------------------------------------------------------
2021-09-30 11:52:41,242 The model key 'pos' now maps to 'https://huggingface.co/flair/pos-english' on the HuggingFace ModelHub
2021-09-30 11:52:41,243  - The most current version of the model is automatically downloaded from there.
2021-09-30 11:52:41,245  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/pos/en-pos-ontonotes-v0.5.pt)
2021-09-30 11:52:41,247 --------------------------------------------------------------------------------
2021-09-30 11:52:42,344 loading file /home/kariminf/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63


In [10]:
for entity in sentence.get_spans('pos'):
    print(entity.text, entity.tag, entity.score)

I PRP 0.9999998807907104
am VBP 0.9999998807907104
going VBG 1.0
to TO 0.9994309544563293
visit VB 0.9999971389770508
Dr. NNP 0.9999998807907104
Watson NNP 1.0
. . 0.9994010925292969
He PRP 1.0
is VBZ 0.9999998807907104
in IN 0.9999984502792358
U.K NNP 0.9999973773956299
. . 0.9999960660934448


### II.2. Training

In [11]:
# 1. get the corpus
# we will use the corpus we created earlier 

# 2. what label do we want to predict?
pos_label_type = 'pos'

# 3. make the label dictionary from the corpus
pos_label_dict = corpus.make_label_dictionary(label_type=pos_label_type)

pos_label_dict

2021-09-30 11:52:48,348 Computing label dictionary. Progress:


100%|██████████| 6/6 [00:00<00:00, 2280.13it/s]

2021-09-30 11:52:48,360 Corpus contains the labels: pos (#25), ner (#24)
2021-09-30 11:52:48,361 Created (for label 'pos') Dictionary with 4 tags: DET, NOUN, VERB, PRON





<flair.data.Dictionary at 0x7f32d6f3fe50>

In [12]:
from flair.embeddings import WordEmbeddings, StackedEmbeddings, CharacterEmbeddings

# 4. initialize embeddings
pos_embedding_types = [

    WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

pos_embeddings = StackedEmbeddings(embeddings=pos_embedding_types)

pos_embeddings

StackedEmbeddings(
  (list_embedding_0): WordEmbeddings('glove')
  (list_embedding_1): CharacterEmbeddings(
    (char_embedding): Embedding(275, 25)
    (char_rnn): LSTM(25, 25, bidirectional=True)
  )
)

In [13]:
from flair.models import SequenceTagger

# 5. initialize sequence tagger
pos_tagger = SequenceTagger(hidden_size=10,
                        embeddings=pos_embeddings,
                        tag_dictionary=pos_label_dict,
                        tag_type=pos_label_type,
                        use_crf=True)

pos_tagger

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=150, out_features=150, bias=True)
  (rnn): LSTM(150, 10, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=20, out_features=6, bias=True)
)

In [14]:
from flair.trainers import ModelTrainer

pos_model_path = '/home/kariminf/Data/tutoriel/flair_pos.tagger.fr'

# 6. initialize trainer
trainer = ModelTrainer(pos_tagger, corpus)

# 7. start training
trainer.train(pos_model_path,
              learning_rate=0.1,
              mini_batch_size=10, 
              max_epochs=5)

del trainer
del pos_tagger

2021-09-30 11:52:53,112 ----------------------------------------------------------------------------------------------------
2021-09-30 11:52:53,114 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=150, out_features=150, bias=True)
  (rnn): LSTM(150, 10, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=20, out_features=6, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2021-09-30 11:52:53,116 ----------------------------------------------------------------------------------------------------
2021-09-30 11:52:53,118 Corpus: "Corpus: 6 train + 4 dev + 2 test sentences"
2021-09-30 11:52:53,121 --------------------------------------

In [15]:
# load the model you trained
pos_load_model = SequenceTagger.load(pos_model_path + "/best-model.pt")

# create example sentence
sentence_fr = Sentence('il peut aider')

# predict tags and print
pos_load_model.predict(sentence_fr)

sentence_fr.to_tagged_string()

2021-09-30 11:53:08,666 loading file /home/kariminf/Data/tutoriel/flair_pos.tagger.fr/best-model.pt


'il <PRON> peut <VERB> aider <VERB>'

## III. Named entity recognition (NER)

### III.1. Recognition

In [16]:
from flair.models import SequenceTagger


# load the NER tagger
ner_tagger = SequenceTagger.load('ner')

# run NER over sentence
ner_tagger.predict(sentence)

del ner_tagger

2021-09-30 11:53:11,006 --------------------------------------------------------------------------------
2021-09-30 11:53:11,007 The model key 'ner' now maps to 'https://huggingface.co/flair/ner-english' on the HuggingFace ModelHub
2021-09-30 11:53:11,008  - The most current version of the model is automatically downloaded from there.
2021-09-30 11:53:11,009  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner/en-ner-conll03-v0.4.pt)
2021-09-30 11:53:11,010 --------------------------------------------------------------------------------
2021-09-30 11:53:11,965 loading file /home/kariminf/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


In [17]:
for entity in sentence.get_spans('ner'):
    print(entity.text, entity.tag, entity.score)


Dr. Watson PER 0.7794725894927979
U.K LOC 0.9533657431602478


In [18]:
# load model
# The model is 500MB, it is so heavy
ar_ner_tagger = SequenceTagger.load('ar-ner')


# predict NER tags
ar_ner_tagger.predict(ar_sentence)

del ar_ner_tagger

2021-09-30 11:53:30,020 --------------------------------------------------------------------------------
2021-09-30 11:53:30,022 The model key 'ar-ner' now maps to 'https://huggingface.co/megantosh/flair-arabic-multi-ner' on the HuggingFace ModelHub
2021-09-30 11:53:30,024  - The most current version of the model is automatically downloaded from there.
2021-09-30 11:53:30,025 --------------------------------------------------------------------------------
2021-09-30 11:53:31,001 loading file /home/kariminf/.flair/models/flair-arabic-multi-ner/c7af7ddef4fdcc681fcbe1f37719348afd2862b12aa1cfd4f3b93bd2d77282c7.242d030cb106124f7f9f6a88fb9af8e390f581d42eeca013367a86d585ee6dd6


In [19]:
# print sentence with predicted tags
for entity in ar_sentence.get_spans('ner'):
    print(entity)

### III.2. Training

In [20]:
# 1. get the corpus
# we will use the corpus we created earlier 

# 2. what label do we want to predict?
ner_label_type = 'ner'

# 3. make the label dictionary from the corpus
ner_label_dict = corpus.make_label_dictionary(label_type=ner_label_type)

ner_label_dict

2021-09-30 11:53:49,398 Computing label dictionary. Progress:


100%|██████████| 6/6 [00:00<00:00, 3963.75it/s]

2021-09-30 11:53:49,407 Corpus contains the labels: pos (#25), ner (#24)
2021-09-30 11:53:49,408 Created (for label 'ner') Dictionary with 4 tags: O, B-PER, I-PER, B-LOC





<flair.data.Dictionary at 0x7f32ef3cb8e0>

In [21]:
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings

# 4. initialize embeddings
ner_embedding_types = [

    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
]

ner_embeddings = StackedEmbeddings(embeddings=ner_embedding_types)

ner_embeddings

StackedEmbeddings(
  (list_embedding_0): WordEmbeddings('glove')
  (list_embedding_1): FlairEmbeddings(
    (lm): LanguageModel(
      (drop): Dropout(p=0.05, inplace=False)
      (encoder): Embedding(300, 100)
      (rnn): LSTM(100, 2048)
      (decoder): Linear(in_features=2048, out_features=300, bias=True)
    )
  )
  (list_embedding_2): FlairEmbeddings(
    (lm): LanguageModel(
      (drop): Dropout(p=0.05, inplace=False)
      (encoder): Embedding(300, 100)
      (rnn): LSTM(100, 2048)
      (decoder): Linear(in_features=2048, out_features=300, bias=True)
    )
  )
)

In [22]:
from flair.models import SequenceTagger

# 5. initialize sequence tagger
ner_tagger = SequenceTagger(hidden_size=10,
                        embeddings=ner_embeddings,
                        tag_dictionary=ner_label_dict,
                        tag_type=ner_label_type,
                        use_crf=True)

ner_tagger

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 10, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=20, out_features=6, bias=True)
)

In [23]:
from flair.trainers import ModelTrainer

ner_model_path = '/home/kariminf/Data/tutoriel/flair_ner.tagger.fr'

# 6. initialize trainer
trainer = ModelTrainer(ner_tagger, corpus)

# 7. start training
trainer.train(ner_model_path,
              learning_rate=0.1,
              mini_batch_size=10, 
              max_epochs=5)

2021-09-30 11:53:56,538 ----------------------------------------------------------------------------------------------------
2021-09-30 11:53:56,541 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 10, batch_first=True, bi

{'test_score': 0.0,
 'dev_score_history': [0.0, 0.0, 0.0, 0.0, 0.0],
 'train_loss_history': [1.317046324412028,
  0.7960201899210612,
  0.5113618771235148,
  0.34312907854715985,
  0.3348756631215413],
 'dev_loss_history': [tensor(0.9116),
  tensor(0.6329),
  tensor(0.5556),
  tensor(0.5189),
  tensor(0.4449)]}

In [24]:
# load the model you trained
pos_load_model = SequenceTagger.load(pos_model_path + "/best-model.pt")

# create example sentence
sentence_fr = Sentence('Karim peut aider')

# predict tags and print
pos_load_model.predict(sentence_fr)

sentence_fr.to_tagged_string()

2021-09-30 11:54:08,855 loading file /home/kariminf/Data/tutoriel/flair_pos.tagger.fr/best-model.pt


'Karim <VERB> peut <VERB> aider <VERB>'