# Named entity recognition with `chaine`

In [1]:
import chaine
import datasets

## Loading a dataset

#### Raw data

In [2]:
dataset = datasets.load_dataset("germeval_14")

In [3]:
print(f"Sequence: {dataset['train']['tokens'][0]}")
print(f"Labels: {dataset['train']['ner_tags'][0]}")

Sequence: ['Schartau', 'sagte', 'dem', '"', 'Tagesspiegel', '"', 'vom', 'Freitag', ',', 'Fischer', 'sei', '"', 'in', 'einer', 'Weise', 'aufgetreten', ',', 'die', 'alles', 'andere', 'als', 'überzeugend', 'war', '"', '.']
Labels: [19, 0, 0, 0, 7, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### Transform to proper sequences

In [4]:
token_sequences = chaine.token_sequences(dataset["train"]["tokens"])
label_sequences = chaine.label_sequences(dataset["train"]["ner_tags"])

In [5]:
next(token_sequences)

<TokenSequence: [<Token 0: Schartau>, <Token 1: sagte>, <Token 2: dem>, <Token 3: ">, <Token 4: Tagesspiegel>, <Token 5: ">, <Token 6: vom>, <Token 7: Freitag>, <Token 8: ,>, <Token 9: Fischer>, <Token 10: sei>, <Token 11: ">, <Token 12: in>, <Token 13: einer>, <Token 14: Weise>, <Token 15: aufgetreten>, <Token 16: ,>, <Token 17: die>, <Token 18: alles>, <Token 19: andere>, <Token 20: als>, <Token 21: überzeugend>, <Token 22: war>, <Token 23: ">, <Token 24: .>]>

In [6]:
next(label_sequences)

<LabelSequence: ['19', '0', '0', '0', '7', '0', '0', '0', '0', '19', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']>

## Training a model using the high-level API

In [7]:
crf = chaine.train(token_sequences, label_sequences, max_iterations=10)

[2020-11-29 22:48:56,977] [INFO] Loading data
[2020-11-29 22:49:05,301] [INFO] Start training
[2020-11-29 22:49:08,678] [INFO] Iteration: 1	Loss: 549913.612197
[2020-11-29 22:49:09,796] [INFO] Iteration: 2	Loss: 377078.992349
[2020-11-29 22:49:13,021] [INFO] Iteration: 3	Loss: 267322.682539
[2020-11-29 22:49:15,353] [INFO] Iteration: 4	Loss: 248339.616374
[2020-11-29 22:49:17,698] [INFO] Iteration: 5	Loss: 233494.412198
[2020-11-29 22:49:18,755] [INFO] Iteration: 6	Loss: 223343.503914
[2020-11-29 22:49:19,795] [INFO] Iteration: 7	Loss: 208772.524234
[2020-11-29 22:49:20,923] [INFO] Iteration: 8	Loss: 190119.766701
[2020-11-29 22:49:22,212] [INFO] Iteration: 9	Loss: 178851.893751
[2020-11-29 22:49:23,356] [INFO] Iteration: 10	Loss: 170839.004304


In [8]:
token_sequence = next(chaine.token_sequences(dataset["train"]["tokens"]))

In [9]:
crf.predict(token_sequence.featurize())

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']