# Named entity recognition with `chaine`

In [1]:
import chaine
import datasets

## Loading a dataset

#### Raw data

In [2]:
dataset = datasets.load_dataset("germeval_14")

Reusing dataset germ_eval14 (/home/severin/.cache/huggingface/datasets/germ_eval14/germeval_14/2.0.0/8c1a4d4b97bceb2f000694b664fda792b29fa486fbfbb1d865d375acf2acff6c)


In [3]:
print(f"Sequence: {dataset['train']['tokens'][0]}")
print(f"Labels: {dataset['train']['ner_tags'][0]}")

Sequence: ['Schartau', 'sagte', 'dem', '"', 'Tagesspiegel', '"', 'vom', 'Freitag', ',', 'Fischer', 'sei', '"', 'in', 'einer', 'Weise', 'aufgetreten', ',', 'die', 'alles', 'andere', 'als', 'überzeugend', 'war', '"', '.']
Labels: [19, 0, 0, 0, 7, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### Transform to proper sequences

In [4]:
token_sequences = chaine.token_sequences(dataset["train"]["tokens"])
label_sequences = chaine.label_sequences(dataset["train"]["ner_tags"])

In [5]:
next(token_sequences)

<TokenSequence: [<Token 0: Schartau>, <Token 1: sagte>, <Token 2: dem>, <Token 3: ">, <Token 4: Tagesspiegel>, <Token 5: ">, <Token 6: vom>, <Token 7: Freitag>, <Token 8: ,>, <Token 9: Fischer>, <Token 10: sei>, <Token 11: ">, <Token 12: in>, <Token 13: einer>, <Token 14: Weise>, <Token 15: aufgetreten>, <Token 16: ,>, <Token 17: die>, <Token 18: alles>, <Token 19: andere>, <Token 20: als>, <Token 21: überzeugend>, <Token 22: war>, <Token 23: ">, <Token 24: .>]>

In [6]:
next(label_sequences)

<LabelSequence: ['19', '0', '0', '0', '7', '0', '0', '0', '0', '19', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']>

## Training a model using the high-level API

In [7]:
crf = chaine.train(token_sequences, label_sequences, max_iterations=10)

[2020-12-01 22:11:45,648] [INFO] Loading data
[2020-12-01 22:11:48,959] [INFO] Processed sequences: 10000
[2020-12-01 22:11:52,213] [INFO] Processed sequences: 20000
[2020-12-01 22:11:53,537] [INFO] Start training
[2020-12-01 22:11:56,444] [INFO] Iteration: 1	Loss: 123113.674920
[2020-12-01 22:11:57,723] [INFO] Iteration: 2	Loss: 104047.092895
[2020-12-01 22:11:59,013] [INFO] Iteration: 3	Loss: 97503.745447
[2020-12-01 22:12:00,307] [INFO] Iteration: 4	Loss: 93356.046698
[2020-12-01 22:12:01,600] [INFO] Iteration: 5	Loss: 90501.864825
[2020-12-01 22:12:02,897] [INFO] Iteration: 6	Loss: 88175.746359
[2020-12-01 22:12:04,197] [INFO] Iteration: 7	Loss: 86161.561674
[2020-12-01 22:12:05,590] [INFO] Iteration: 8	Loss: 84571.676818
[2020-12-01 22:12:06,905] [INFO] Iteration: 9	Loss: 83597.010195
[2020-12-01 22:12:08,214] [INFO] Iteration: 10	Loss: 82401.065625
[2020-12-01 22:12:08,216] [INFO] Iteration: None	Loss: 82401.065625


In [8]:
token_sequence = next(chaine.token_sequences(dataset["train"]["tokens"]))

In [9]:
crf.predict(token_sequence.featurize())

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

## Custom features and lower-level training

In [1]:
import spacy
from chaine.data import Token, TokenSequence

In [23]:
help(TokenSequence.featurize)

Help on function featurize in module chaine.data:

featurize(self) -> Generator[List[str], NoneType, NoneType]
    Extract features from tokens of the sequence
    
    Note
    ----
    Overwrite this method for custom feature extraction – which is generally
    recommended as the default features may result in very low accuracy.
    
    One token is represented as a set of strings, each string is a unique feature,
    e.g. the string representation of the current token.



In [3]:
import spacy
nlp = spacy.load('de')
nlp.tokenizer = nlp.tokenizer.tokens_from_list

for doc in nlp.pipe([['I', 'like', 'cookies', '.'], ['Do', 'you', '?']]):
    for token in doc:
        print(token, token.pos_)

OSError: [E050] Can't find model 'de'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [48]:
class CustomTokenSequence(TokenSequence):
    def __post_init__(self):
        self.nlp = spacy.load("de_core_news_sm")
        self.nlp.tokenizer = self.nlp.tokenizer.tokens_from_list
    
    def featurize(self):
        pos = [t.pos_ for s in nlp.pipe([self.items]) for t in s]   
        
        for token, tag in zip(self.items, pos):
            token.pos = tag
        
        for token in self.items:
            features = {
                f"token.lower()={token.lower()}",
                f"token.is_upper()={token.is_upper}",
                f"token.is_title()={token.is_title}",
                f"token.is_digit()={token.is_digit}",
                f"token.pos={token.pos}",
            }

            if token.index > 0:
                left_token = self.items[token.index - 1]
                features.update(
                    {
                        f"-1:token.lower()={left_token.lower()}",
                        f"-1:token.is_title()={left_token.is_title}",
                        f"-1:token.is_upper()={left_token.is_upper}",
                        f"-1:token.pos={left_token.pos}",
                    }
                )
            else:
                features.add("BOS=True")

            if token.index < max(self.indices):
                right_token = self.items[token.index + 1]
                features.update(
                    {
                        f"+1:token.lower()={right_token.lower()}",
                        f"+1:token.is_title()={right_token.is_title}",
                        f"+1:token.is_upper()={right_token.is_upper}",
                        f"+1:token.pos={right_token.pos}",
                    }
                )
            else:
                features.add("EOS=True")
            yield features

In [49]:
tokens

[<Token 0: John>,
 <Token 1: Lennon>,
 <Token 2: was>,
 <Token 3: in>,
 <Token 4: The>,
 <Token 5: Beatles.>]

In [50]:
next(CustomTokenSequence(tokens).featurize())

{'+1:token.is_title()=True',
 '+1:token.is_upper()=False',
 '+1:token.lower()=lennon',
 '+1:token.pos=NNP',
 'BOS=True',
 'token.is_digit()=False',
 'token.is_title()=True',
 'token.is_upper()=False',
 'token.lower()=john',
 'token.pos=NNP'}

In [57]:
token_sequences = [CustomTokenSequence(s.items) for s in token_sequences]

In [None]:
crf = chaine.train(sequences, label_sequences, max_iterations=100)

[2020-11-30 23:16:06,431] [INFO] Loading data
