In [1]:
from typing import List

from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Train a named-entity sequence tagger on the CoNLL-03 Dutch corpus with flair.
#
# Tunable settings for this run, gathered in one place so they are easy to find.
HIDDEN_SIZE = 256                      # hidden_size handed to SequenceTagger
MAX_EPOCHS = 150                       # max_epochs handed to trainer.train
MODEL_DIR = 'models/flair/dutch-ner'   # base path handed to trainer.train

# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# 4. initialize embeddings
# The corpus is Dutch, so the stack uses Dutch models. The earlier stack
# ('crawl', 'twitter', 'news-forward', 'news-backward') names flair's English
# word and language-model embeddings, which do not match the training data.
# NOTE(review): 'nl', 'nl-forward' and 'nl-backward' are taken from flair's
# embeddings documentation -- confirm they resolve in the installed version.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('nl'),
    FlairEmbeddings('nl-forward'),
    FlairEmbeddings('nl-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training, passing MODEL_DIR as the base path
trainer.train(MODEL_DIR,
              max_epochs=MAX_EPOCHS)

2019-03-18 11:08:13,217 Reading data from /home/snie/.flair/datasets/conll_03_dutch
2019-03-18 11:08:13,218 Train: /home/snie/.flair/datasets/conll_03_dutch/ned.train
2019-03-18 11:08:13,218 Dev: /home/snie/.flair/datasets/conll_03_dutch/ned.testa
2019-03-18 11:08:13,218 Test: /home/snie/.flair/datasets/conll_03_dutch/ned.testb
2019-03-18 11:08:13,223 UTF-8 can't read: /home/snie/.flair/datasets/conll_03_dutch/ned.train ... using "latin-1" instead.
2019-03-18 11:08:15,095 UTF-8 can't read: /home/snie/.flair/datasets/conll_03_dutch/ned.testb ... using "latin-1" instead.
2019-03-18 11:08:15,675 UTF-8 can't read: /home/snie/.flair/datasets/conll_03_dutch/ned.testa ... using "latin-1" instead.
2019-03-18 11:08:24,193 ----------------------------------------------------------------------------------------------------
2019-03-18 11:08:24,193 Evaluation method: MICRO_F1_SCORE
2019-03-18 11:08:24,195 ----------------------------------------------------------------------------------------------

KeyboardInterrupt: 