# Named entity recognition with `chaine`

In [1]:
import chaine
import datasets

## Loading a dataset

#### Raw data

In [2]:
dataset = datasets.load_dataset("germeval_14")

Reusing dataset germ_eval14 (/home/severin/.cache/huggingface/datasets/germ_eval14/germeval_14/2.0.0/8c1a4d4b97bceb2f000694b664fda792b29fa486fbfbb1d865d375acf2acff6c)


In [3]:
print(f"Sequence: {dataset['train']['tokens'][0]}")
print(f"Labels: {dataset['train']['ner_tags'][0]}")

Sequence: ['Schartau', 'sagte', 'dem', '"', 'Tagesspiegel', '"', 'vom', 'Freitag', ',', 'Fischer', 'sei', '"', 'in', 'einer', 'Weise', 'aufgetreten', ',', 'die', 'alles', 'andere', 'als', 'überzeugend', 'war', '"', '.']
Labels: [19, 0, 0, 0, 7, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### Transform to proper sequences

In [44]:
token_sequences = chaine.token_sequences(dataset["train"]["tokens"])
label_sequences = chaine.label_sequences(dataset["train"]["ner_tags"])

In [6]:
next(label_sequences)

<LabelSequence: ['19', '0', '0', '0', '7', '0', '0', '0', '0', '19', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']>

## Training a model using the high-level API

In [7]:
crf = chaine.train(token_sequences, label_sequences, max_iterations=10)

[2020-12-01 22:42:51,829] [INFO] Loading data
[2020-12-01 22:42:55,196] [INFO] Processed sequences: 10000
[2020-12-01 22:42:58,442] [INFO] Processed sequences: 20000
[2020-12-01 22:42:59,757] [INFO] Start training
[2020-12-01 22:43:02,738] [INFO] Iteration: 1	Loss: 123113.674920
[2020-12-01 22:43:04,048] [INFO] Iteration: 2	Loss: 104047.092895
[2020-12-01 22:43:05,378] [INFO] Iteration: 3	Loss: 97503.745447
[2020-12-01 22:43:06,696] [INFO] Iteration: 4	Loss: 93356.046698
[2020-12-01 22:43:08,014] [INFO] Iteration: 5	Loss: 90501.864825
[2020-12-01 22:43:09,392] [INFO] Iteration: 6	Loss: 88175.746359
[2020-12-01 22:43:11,047] [INFO] Iteration: 7	Loss: 86161.561674
[2020-12-01 22:43:12,418] [INFO] Iteration: 8	Loss: 84571.676818
[2020-12-01 22:43:13,804] [INFO] Iteration: 9	Loss: 83597.010195
[2020-12-01 22:43:15,143] [INFO] Iteration: 10	Loss: 82401.065625


In [8]:
token_sequence = next(chaine.token_sequences(dataset["train"]["tokens"]))

In [9]:
crf.predict(token_sequence.featurize())

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

## Custom features and lower-level training

In [10]:
import spacy
from chaine.data import Token, TokenSequence

In [56]:
token_sequences = chaine.token_sequences(dataset["train"]["tokens"])
label_sequences = chaine.label_sequences(dataset["train"]["ner_tags"])

In [58]:
nlp = spacy.load("de_core_news_sm")
nlp.tokenizer = nlp.tokenizer.tokens_from_list

def featurize(sequence):
    pos = [t.pos_ for s in nlp.pipe([[token.text for token in sequence]]) for t in s]   
    for token, tag in zip(sequence, pos):
        token.pos = tag
    return sequence

In [59]:
token_sequence = [featurize(sequence) for sequence in token_sequences]

In [62]:
crf = chaine.train(token_sequence, label_sequences, max_iterations=100)

[2020-12-01 23:41:21,463] [INFO] Loading data
[2020-12-01 23:41:24,606] [INFO] Processed sequences: 10000
[2020-12-01 23:41:27,807] [INFO] Processed sequences: 20000
[2020-12-01 23:41:29,103] [INFO] Start training
[2020-12-01 23:41:32,197] [INFO] Iteration: 1	Loss: 122622.147907
[2020-12-01 23:41:33,536] [INFO] Iteration: 2	Loss: 104254.210353
[2020-12-01 23:41:34,879] [INFO] Iteration: 3	Loss: 97574.764608
[2020-12-01 23:41:36,227] [INFO] Iteration: 4	Loss: 93279.302997
[2020-12-01 23:41:37,573] [INFO] Iteration: 5	Loss: 90209.168042
[2020-12-01 23:41:38,916] [INFO] Iteration: 6	Loss: 88087.504562
[2020-12-01 23:41:40,255] [INFO] Iteration: 7	Loss: 86266.022802
[2020-12-01 23:41:41,606] [INFO] Iteration: 8	Loss: 84801.256419
[2020-12-01 23:41:43,011] [INFO] Iteration: 9	Loss: 83681.514502
[2020-12-01 23:41:44,361] [INFO] Iteration: 10	Loss: 82427.379267
[2020-12-01 23:41:45,717] [INFO] Iteration: 11	Loss: 81578.665727
[2020-12-01 23:41:47,096] [INFO] Iteration: 12	Loss: 80642.752768
[

In [63]:
label_sequences = chaine.label_sequences(dataset["test"]["ner_tags"])
token_sequences = [featurize(sequence) for sequence in chaine.token_sequences(dataset["test"]["tokens"])]

In [71]:
token_sequences = [list(t.featurize()) for t in token_sequences]

In [73]:
prediction = crf.predict(token_sequences)

SystemError: <method 'predict' of 'chaine.crf.Model' objects> returned a result with an error set

In [79]:
predictions = [crf.predict(t) for t in token_sequences]

In [85]:
from seqeval import metrics

In [95]:
label_sequences = [s.items for s in chaine.label_sequences(dataset["test"]["ner_tags"])]

In [98]:
print(metrics.sequence_labeling.classification_report(label_sequences, predictions))

              precision    recall  f1-score   support

           0       0.79      0.74      0.76       810
           1       1.00      0.01      0.01       183
           2       0.00      0.00      0.00         4
           3       0.78      0.26      0.39       738
           4       0.48      0.16      0.24       351
           5       1.00      0.13      0.23        39
           7       0.00      0.00      0.00        42
           9       0.84      0.45      0.59      1633
           _       0.33      0.15      0.21      2327

   micro avg       0.63      0.32      0.42      6127
   macro avg       0.58      0.21      0.27      6127
weighted avg       0.61      0.32      0.40      6127

