# Sequence tagging using flair

In [None]:
%pip install flair

## I. Data preparation

### I.1. Tokenization

In [16]:
from flair.data import Sentence

# Build a Sentence from raw text; flair tokenizes it on construction.
# Alternative example sentence:
#sentence = Sentence('Karim bought a Lenovo computer with over 70000 DZD when he was in Algiers, Algeria.')

sentence = Sentence(
    "I am going to visit Dr. Watson. He is as U.K."
)
sentence.tokens

[Token: 1 I,
 Token: 2 am,
 Token: 3 going,
 Token: 4 to,
 Token: 5 visit,
 Token: 6 Dr.,
 Token: 7 Watson,
 Token: 8 .,
 Token: 9 He,
 Token: 10 is,
 Token: 11 as,
 Token: 12 U.K,
 Token: 13 .]

In [14]:
# A list of strings is taken as an already tokenized sentence (one token per item)
sentence2 = Sentence(
    ['Karim', 'bought', 'a', 'Lenovo', 'computer']
)

sentence2.tokens

[Token: 1 Karim,
 Token: 2 bought,
 Token: 3 a,
 Token: 4 Lenovo,
 Token: 5 computer]

In [16]:
# Plug a language-specific tokenizer into Sentence
from flair.tokenization import JapaneseTokenizer

# Japanese tokenizer configured with "janome"
ja_tokenizer = JapaneseTokenizer("janome")

# the text is tokenized by the tokenizer passed through `use_tokenizer`
sentence3 = Sentence(
    '私はESIの先生です。毎日、そこに行きます。',
    use_tokenizer=ja_tokenizer,
)

sentence3.tokens

[Token: 1 私,
 Token: 2 は,
 Token: 3 ESI,
 Token: 4 の,
 Token: 5 先生,
 Token: 6 です,
 Token: 7 。,
 Token: 8 毎日,
 Token: 9 、,
 Token: 10 そこ,
 Token: 11 に,
 Token: 12 行き,
 Token: 13 ます,
 Token: 14 。]

In [15]:
from flair.data import Sentence, Token, Tokenizer
from typing import List
import re

class ArTokenizer(Tokenizer):
    """Simple whitespace tokenizer for Arabic text.

    The text is split on whitespace; when a chunk ends with one of the
    punctuation marks listed in ``self.punct``, that single trailing mark is
    detached and emitted as its own token.
    """

    def __init__(self):
        super(ArTokenizer, self).__init__()
        # group(1): everything before the last character (may be empty)
        # group(2): exactly one trailing punctuation mark
        self.punct = re.compile(r'^(.*)([،:.,,؟!])$')

    def tokenize(self, text: str) -> List[Token]:
        """Split ``text`` into tokens.

        Args:
            text: raw sentence text.

        Returns:
            The tokens in reading order; an empty list for empty or
            whitespace-only text.
        """
        tokens: List[Token] = []
        for word in text.split():
            m = self.punct.match(word)
            # Only split when there is a non-empty word body: a chunk that is
            # just a punctuation mark must not produce an empty-text token.
            if m and m.group(1):
                tokens.append(Token(m.group(1)))
                tokens.append(Token(m.group(2)))
            else:
                tokens.append(Token(word))
        return tokens
        
        

# instantiate the custom Arabic tokenizer
ar_tokenizer = ArTokenizer()

# tokenize an Arabic text with it
ar_sentence = Sentence(
    "أنا ذاهب إلى السوق. هل تريد أن أحضر لك شيء ما؟ هكذا إذن! نلتقي بعد أن أعود.",
    use_tokenizer=ar_tokenizer,
)

ar_sentence.tokens

[Token: 1 أنا,
 Token: 2 ذاهب,
 Token: 3 إلى,
 Token: 4 السوق,
 Token: 5 .,
 Token: 6 هل,
 Token: 7 تريد,
 Token: 8 أن,
 Token: 9 أحضر,
 Token: 10 لك,
 Token: 11 شيء,
 Token: 12 ما,
 Token: 13 ؟,
 Token: 14 هكذا,
 Token: 15 إذن,
 Token: 16 !,
 Token: 17 نلتقي,
 Token: 18 بعد,
 Token: 19 أن,
 Token: 20 أعود,
 Token: 21 .]

### I.2. Corpus preparation

In [8]:
# use one of the corpora that ship with flair
import flair.datasets
ud_en_corpus = flair.datasets.UD_ENGLISH()

# first test sentence: full representation, then its PoS-tagged string
print('______ Access a sentence ______', ud_en_corpus.test[0], sep='\n')
print('______ PoS-tagged sentence ______', ud_en_corpus.test[0].to_tagged_string('pos'), sep='\n')


2021-09-29 20:03:25,688 Reading data from /home/kariminf/.flair/datasets/ud_english
2021-09-29 20:03:25,689 Train: /home/kariminf/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2021-09-29 20:03:25,692 Dev: /home/kariminf/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2021-09-29 20:03:25,694 Test: /home/kariminf/.flair/datasets/ud_english/en_ewt-ud-test.conllu
______ Access a sentence ______
Sentence: "What if Google Morphed Into GoogleOS ?"   [− Tokens: 7  − Token-Labels: "What <what/PRON/WP/root/Int> if <if/SCONJ/IN/mark> Google <Google/PROPN/NNP/nsubj/Sing> Morphed <morph/VERB/VBD/advcl/Ind/Past/Fin> Into <into/ADP/IN/case> GoogleOS <GoogleOS/PROPN/NNP/obl/Sing> ? <?/PUNCT/./punct>"]
______ PoS-tagged sentence ______
What <WP> if <IN> Google <NNP> Morphed <VBD> Into <IN> GoogleOS <NNP> ? <.>


In [5]:
# build a corpus from your own column-formatted files
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# column index -> label name, in file order
columns = dict(enumerate(['text', 'pos', 'ner']))

# folder holding the train, test and dev files
data_folder = '.'

# assemble the corpus from the column layout, the folder and the three file names
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='flair_train.txt',
    test_file='flair_test.txt',
    dev_file='flair_dev.txt',
)

2021-09-29 20:31:16,457 Reading data from .
2021-09-29 20:31:16,459 Train: flair_train.txt
2021-09-29 20:31:16,459 Dev: flair_dev.txt
2021-09-29 20:31:16,460 Test: flair_test.txt


In [6]:
# show the first training sentence of the custom corpus with its 'pos' labels
corpus.train[0].to_tagged_string('pos')

'un <DET> ordianteur <NOUN> peut <VERB> vous <PRON> aider <VERB>'

## II. Part of Speech (PoS) tagging

### II.1. Tagging

In [17]:
from flair.models import SequenceTagger

# load the pre-trained tagger registered under the model key 'pos'
pos_tagger = SequenceTagger.load('pos')

# tag `sentence`; the predictions are read back from it in the next cell
pos_tagger.predict(sentence)

# the tagger is not used again in this section, so drop the reference to it
del pos_tagger

2021-09-29 15:15:36,712 --------------------------------------------------------------------------------
2021-09-29 15:15:36,713 The model key 'pos' now maps to 'https://huggingface.co/flair/pos-english' on the HuggingFace ModelHub
2021-09-29 15:15:36,715  - The most current version of the model is automatically downloaded from there.
2021-09-29 15:15:36,716  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/pos/en-pos-ontonotes-v0.5.pt)
2021-09-29 15:15:36,717 --------------------------------------------------------------------------------


Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

2021-09-29 15:16:24,644 loading file /home/kariminf/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63


In [18]:
# one line per 'pos' span: text, predicted tag, confidence score
for span in sentence.get_spans('pos'):
    print(f"{span.text} {span.tag} {span.score}")

Karim NNP 0.9999685287475586
bought VBD 0.9999796152114868
a DT 0.9999998807907104
Lenovo NNP 0.9999986886978149
computer NN 0.9999860525131226
with IN 1.0
over IN 0.9845556616783142
70000 CD 0.9999727010726929
DZD NNP 0.7187032103538513
when WRB 1.0
he PRP 0.9999998807907104
was VBD 0.9999997615814209
in IN 0.9999995231628418
Algiers NNP 0.9992132186889648
, , 1.0
Algeria NNP 0.9999998807907104
. . 0.999991774559021


### II.2. Training

In [8]:
# Step 1 - corpus: reuse `corpus`, the ColumnCorpus created earlier

# Step 2 - name of the label the tagger has to predict
pos_label_type = 'pos'

# Step 3 - build the label dictionary for that label from the corpus
pos_label_dict = corpus.make_label_dictionary(
    label_type=pos_label_type,
)

pos_label_dict

2021-09-29 20:36:28,022 Computing label dictionary. Progress:


100%|██████████| 6/6 [00:00<00:00, 4891.32it/s]

2021-09-29 20:36:28,028 Corpus contains the labels: pos (#25), ner (#24)
2021-09-29 20:36:28,029 Created (for label 'pos') Dictionary with 4 tags: DET, NOUN, VERB, PRON





<flair.data.Dictionary at 0x7f4afd902bb0>

In [9]:
# CharacterEmbeddings is used below, so it must be imported too (it was missing,
# which raises a NameError as soon as the list is evaluated)
from flair.embeddings import CharacterEmbeddings, StackedEmbeddings, WordEmbeddings

# 4. initialize embeddings
pos_embedding_types = [

    WordEmbeddings('glove'),

    # comment out this line to disable character embeddings
    CharacterEmbeddings(),

    # uncomment these lines (and import FlairEmbeddings) to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# combine the individual embeddings into a single stacked embedding
pos_embeddings = StackedEmbeddings(embeddings=pos_embedding_types)

pos_embeddings

2021-09-29 20:37:02,667 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpr18kkm75


  6%|▌         | 9807872/160000128 [00:36<10:45, 232527.37B/s]

KeyboardInterrupt: 

In [None]:
from flair.models import SequenceTagger

# 5. build the sequence tagger from the stacked embeddings and the label dictionary
pos_tagger = SequenceTagger(
    embeddings=pos_embeddings,
    tag_dictionary=pos_label_dict,
    tag_type=pos_label_type,
    hidden_size=10,
    use_crf=True,
)

pos_tagger

In [None]:
from flair.trainers import ModelTrainer

# Directory the trainer writes its output to. Kept relative to the working
# directory (like `data_folder`) instead of an absolute path inside one user's
# home directory, so the notebook can run on any machine.
pos_model_path = 'flair_pos.tagger.fr'

# 6. initialize trainer
trainer = ModelTrainer(pos_tagger, corpus)

# 7. start training
trainer.train(pos_model_path,
              learning_rate=0.1,
              mini_batch_size=10,
              max_epochs=5)

In [None]:
import os

# load the model you trained
# `pos_model_path` is the training output *directory*; the trained model is the
# 'final-model.pt' file the trainer saves inside it, so load that file.
pos_load_model = SequenceTagger.load(os.path.join(pos_model_path, 'final-model.pt'))

# create example sentence
sentence_fr = Sentence('il peut aider')

# predict tags and print
pos_load_model.predict(sentence_fr)

sentence_fr.to_tagged_string()

## III. Named entity recognition (NER)

### III.1. Recognition

In [4]:
from flair.models import SequenceTagger


# load the pre-trained tagger registered under the model key 'ner'
ner_tagger = SequenceTagger.load('ner')

# tag `sentence`; the predictions are read back from it in the next cell
ner_tagger.predict(sentence)

# the tagger is not used again before a new one is created, so drop the reference
del ner_tagger

2021-09-29 14:34:11,093 --------------------------------------------------------------------------------
2021-09-29 14:34:11,094 The model key 'ner' now maps to 'https://huggingface.co/flair/ner-english' on the HuggingFace ModelHub
2021-09-29 14:34:11,096  - The most current version of the model is automatically downloaded from there.
2021-09-29 14:34:11,099  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner/en-ner-conll03-v0.4.pt)
2021-09-29 14:34:11,100 --------------------------------------------------------------------------------
2021-09-29 14:34:11,990 loading file /home/kariminf/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


In [8]:
# one line per recognized entity: text, predicted tag, confidence score
for span in sentence.get_spans('ner'):
    print(f"{span.text} {span.tag} {span.score}")


Karim PER 0.9950203895568848
Lenovo ORG 0.9930753707885742
DZD MISC 0.661797046661377
Algiers LOC 0.999290943145752
Algeria LOC 0.9998273253440857


In [24]:
# load the Arabic NER model registered under the key 'ar-ner'
# NOTE: the model is large (about 500MB), so loading it is slow
ar_ner_tagger = SequenceTagger.load('ar-ner')


# predict NER tags for the Arabic sentence tokenized earlier
ar_ner_tagger.predict(ar_sentence)

# drop the reference to the large model once prediction is done
del ar_ner_tagger

2021-09-29 15:57:25,556 --------------------------------------------------------------------------------
2021-09-29 15:57:25,574 The model key 'ar-ner' now maps to 'https://huggingface.co/megantosh/flair-arabic-multi-ner' on the HuggingFace ModelHub
2021-09-29 15:57:25,576  - The most current version of the model is automatically downloaded from there.
2021-09-29 15:57:25,579 --------------------------------------------------------------------------------
2021-09-29 15:57:26,589 loading file /home/kariminf/.flair/models/flair-arabic-multi-ner/c7af7ddef4fdcc681fcbe1f37719348afd2862b12aa1cfd4f3b93bd2d77282c7.242d030cb106124f7f9f6a88fb9af8e390f581d42eeca013367a86d585ee6dd6


In [26]:
# list each entity span predicted for the Arabic sentence
for ar_span in ar_sentence.get_spans('ner'):
    print(ar_span)

### III.2. Training

In [17]:
# Step 1 - corpus: reuse `corpus`, the ColumnCorpus created earlier

# Step 2 - name of the label the tagger has to predict
ner_label_type = 'ner'

# Step 3 - build the label dictionary for that label from the corpus
ner_label_dict = corpus.make_label_dictionary(
    label_type=ner_label_type,
)

ner_label_dict

2021-09-29 21:13:47,312 Computing label dictionary. Progress:



100%|██████████| 6/6 [00:00<00:00, 4854.52it/s]

2021-09-29 21:13:47,318 Corpus contains the labels: pos (#25), ner (#24)
2021-09-29 21:13:47,319 Created (for label 'ner') Dictionary with 4 tags: O, B-PER, I-PER, B-LOC





<flair.data.Dictionary at 0x7f4afcfa0070>

In [None]:
# FlairEmbeddings is used below, so it must be imported too (it was missing,
# which raises a NameError as soon as the list is evaluated)
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, WordEmbeddings

# 4. initialize embeddings
ner_embedding_types = [

    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
]

# combine the individual embeddings into a single stacked embedding
ner_embeddings = StackedEmbeddings(embeddings=ner_embedding_types)

ner_embeddings

In [None]:
from flair.models import SequenceTagger

# 5. build the sequence tagger from the stacked embeddings and the label dictionary
ner_tagger = SequenceTagger(
    embeddings=ner_embeddings,
    tag_dictionary=ner_label_dict,
    tag_type=ner_label_type,
    hidden_size=10,
    use_crf=True,
)

ner_tagger

In [None]:
from flair.trainers import ModelTrainer

# Directory the trainer writes its output to. Kept relative to the working
# directory (like `data_folder`) instead of an absolute path inside one user's
# home directory, so the notebook can run on any machine.
ner_model_path = 'flair_ner.tagger.fr'

# 6. initialize trainer
trainer = ModelTrainer(ner_tagger, corpus)

# 7. start training
trainer.train(ner_model_path,
              learning_rate=0.1,
              mini_batch_size=10,
              max_epochs=5)

In [None]:
import os

# load the NER model you trained (this cell previously reloaded the PoS model)
# `ner_model_path` is the training output *directory*; the trained model is the
# 'final-model.pt' file the trainer saves inside it, so load that file.
ner_load_model = SequenceTagger.load(os.path.join(ner_model_path, 'final-model.pt'))

# create example sentence
sentence_fr = Sentence('il peut aider')

# predict tags and print
ner_load_model.predict(sentence_fr)

sentence_fr.to_tagged_string()