In [None]:
# %%capture  (uncomment as the FIRST line of this cell to capture/hide the install output)
# NOTE(review): flair is installed unpinned; the logs recorded in this notebook date from
#  2023-01 — consider pinning the flair version that was actually used.
%pip install flair

In [1]:
# The LER corpus also ships with flair (load it via 'NER_GERMAN_LEGAL');
# this cell reads the local CoNLL files instead.
# NOTE: the following cell rebinds `corpus`, so on a top-to-bottom run this
# local load is superseded by the flair-bundled dataset.

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Directory containing the train, dev and test files.
data_folder = '../data/ler/'

# Column layout of the files: column 0 is mapped to 'text', column 1 to 'ner'.
columns = {
    0: 'text',
    1: 'ner',
}

# Build the corpus from the three column-formatted split files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    dev_file='ler_dev.conll',
    test_file='ler_test.conll',
    train_file='ler_train.conll',
)

2023-01-09 21:18:17,360 Reading data from ../data/ler
2023-01-09 21:18:17,361 Train: ../data/ler/ler_train.conll
2023-01-09 21:18:17,361 Dev: ../data/ler/ler_dev.conll
2023-01-09 21:18:17,361 Test: ../data/ler/ler_test.conll


In [2]:
# The LER corpus also ships with flair; load it via 'NER_GERMAN_LEGAL'.
# TODO: why does this version have no dev and test splits? I believe the original
#  release had none back then, but it does now.
# NOTE(review): this rebinds `corpus`, replacing the ColumnCorpus built in the previous cell.
# NOTE(review): the log below reports "Dev: None" / "Test: None"; flair presumably samples
#  the missing splits from train — confirm against the flair Corpus documentation.
import flair.datasets
corpus = flair.datasets.NER_GERMAN_LEGAL()

2023-01-09 21:21:09,085 Reading data from /home/stefan/.flair/datasets/ner_german_legal
2023-01-09 21:21:09,086 Train: /home/stefan/.flair/datasets/ner_german_legal/ler.conll
2023-01-09 21:21:09,086 Dev: None
2023-01-09 21:21:09,087 Test: None


In [3]:
# Shrink the corpus to a fraction of 0.01 for quick experiments. The call returns the
# corpus object itself (hence the echoed repr below).
# NOTE(review): presumably this mutates `corpus` in place, so re-running the cell shrinks it
#  again — reload the corpus before re-running; confirm.
corpus.downsample(0.01)

<flair.datasets.sequence_labeling.NER_GERMAN_LEGAL at 0x7fe9c27808e0>

In [4]:
# Number of sentences in the train split (after downsampling)
len(corpus.train)

540

In [5]:
# Print one example training sentence (index 5) as a tagged string for the 'ner' label type
print(corpus.train[5].to_tagged_string('ner'))

Sentence: "Das Arbeitsverhältnis des Klägers endete mit Ablauf des 30. April 2007 ."


In [6]:
# TODO: the dictionary contains an <unk> label, giving 20 tags instead of the expected 19 —
#  look into this; or does <unk> simply stand for the O tag?
# NOTE(review): in the run recorded below (on the downsampled corpus) the dictionary prints
#  19 entries including <unk>, i.e. 18 entity classes; presumably the count depends on which
#  classes survive downsampling — confirm.

# 2. what label do we want to predict?
label_type = 'ner'

# 3. make the label dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=label_type)
print(tag_dictionary)

2023-01-09 21:21:38,956 Computing label dictionary. Progress:


540it [00:00, 62523.79it/s]

2023-01-09 21:21:38,990 Dictionary created for label 'ner' with 19 values: GS (seen 164 times), RS (seen 116 times), GRT (seen 28 times), VT (seen 25 times), LIT (seen 19 times), PER (seen 13 times), LD (seen 12 times), INN (seen 11 times), EUN (seen 10 times), ST (seen 7 times), VS (seen 6 times), UN (seen 6 times), RR (seen 3 times), LDS (seen 3 times), ORG (seen 2 times), MRK (seen 2 times), VO (seen 2 times), STR (seen 1 times)
Dictionary with 19 tags: <unk>, GS, RS, GRT, VT, LIT, PER, LD, INN, EUN, ST, VS, UN, RR, LDS, ORG, MRK, VO, STR





In [7]:
import torch

# 4. initialize fine-tuneable transformer embeddings WITH document context
from flair.embeddings import TransformerWordEmbeddings

# NOTE: the recorded run of this cell ended in a CUDA out-of-memory error on a 7.77 GiB GPU
# (a 978 MiB allocation failed with 4.18 GiB already allocated). If that happens, restart the
# kernel to release GPU memory held by earlier runs and/or switch to a smaller checkpoint
# (e.g. 'xlm-roberta-base').
embeddings = TransformerWordEmbeddings(
    model='xlm-roberta-large',
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
)

# 5. initialize bare-bones sequence tagger (no CRF, no RNN, no reprojection)
from flair.models import SequenceTagger

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    # reuse the label type the dictionary was built with so the two cannot drift apart
    tag_type=label_type,
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# 6. initialize trainer
from flair.trainers import ModelTrainer

# The optimizer is no longer passed to the ModelTrainer constructor here; it is handed to
# train() below. Without it, train() uses its default optimizer instead of the intended AdamW.
trainer = ModelTrainer(tagger, corpus)

# 7. run training with XLM parameters (20 epochs, small LR, AdamW optimizer)
from torch.optim.lr_scheduler import OneCycleLR

trainer.train('resources/taggers/ner-german-large',
              learning_rate=5.0e-6,
              mini_batch_size=4,
              mini_batch_chunk_size=1,
              max_epochs=20,
              optimizer=torch.optim.AdamW,
              scheduler=OneCycleLR,
              embeddings_storage_mode='none',
              weight_decay=0.,
              )


2023-01-09 21:21:51,073 SequenceTagger predicts: Dictionary with 73 tags: O, S-GS, B-GS, E-GS, I-GS, S-RS, B-RS, E-RS, I-RS, S-GRT, B-GRT, E-GRT, I-GRT, S-VT, B-VT, E-VT, I-VT, S-LIT, B-LIT, E-LIT, I-LIT, S-PER, B-PER, E-PER, I-PER, S-LD, B-LD, E-LD, I-LD, S-INN, B-INN, E-INN, I-INN, S-EUN, B-EUN, E-EUN, I-EUN, S-ST, B-ST, E-ST, I-ST, S-VS, B-VS, E-VS, I-VS, S-UN, B-UN, E-UN, I-UN, S-RR
2023-01-09 21:21:51,078 ----------------------------------------------------------------------------------------------------
2023-01-09 21:21:51,080 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    

OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB (GPU 0; 7.77 GiB total capacity; 4.18 GiB already allocated; 791.06 MiB free; 5.32 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF