# Named entity recognition with `chaine`

In [1]:
import chaine
import datasets

## Loading a dataset

#### Raw data

In [2]:
dataset = datasets.load_dataset("germeval_14")

Reusing dataset germ_eval14 (/home/severin/.cache/huggingface/datasets/germ_eval14/germeval_14/2.0.0/8c1a4d4b97bceb2f000694b664fda792b29fa486fbfbb1d865d375acf2acff6c)


In [3]:
print(f"Sequence: {dataset['train']['tokens'][0]}")
print(f"Labels: {dataset['train']['ner_tags'][0]}")

Sequence: ['Schartau', 'sagte', 'dem', '"', 'Tagesspiegel', '"', 'vom', 'Freitag', ',', 'Fischer', 'sei', '"', 'in', 'einer', 'Weise', 'aufgetreten', ',', 'die', 'alles', 'andere', 'als', 'überzeugend', 'war', '"', '.']
Labels: [19, 0, 0, 0, 7, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### Transform to proper sequences

In [44]:
token_sequences = chaine.token_sequences(dataset["train"]["tokens"])
label_sequences = chaine.label_sequences(dataset["train"]["ner_tags"])

In [6]:
next(label_sequences)

<LabelSequence: ['19', '0', '0', '0', '7', '0', '0', '0', '0', '19', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']>

## Training a model using the high-level API

In [7]:
crf = chaine.train(token_sequences, label_sequences, max_iterations=10)

[2020-12-01 22:42:51,829] [INFO] Loading data
[2020-12-01 22:42:55,196] [INFO] Processed sequences: 10000
[2020-12-01 22:42:58,442] [INFO] Processed sequences: 20000
[2020-12-01 22:42:59,757] [INFO] Start training
[2020-12-01 22:43:02,738] [INFO] Iteration: 1	Loss: 123113.674920
[2020-12-01 22:43:04,048] [INFO] Iteration: 2	Loss: 104047.092895
[2020-12-01 22:43:05,378] [INFO] Iteration: 3	Loss: 97503.745447
[2020-12-01 22:43:06,696] [INFO] Iteration: 4	Loss: 93356.046698
[2020-12-01 22:43:08,014] [INFO] Iteration: 5	Loss: 90501.864825
[2020-12-01 22:43:09,392] [INFO] Iteration: 6	Loss: 88175.746359
[2020-12-01 22:43:11,047] [INFO] Iteration: 7	Loss: 86161.561674
[2020-12-01 22:43:12,418] [INFO] Iteration: 8	Loss: 84571.676818
[2020-12-01 22:43:13,804] [INFO] Iteration: 9	Loss: 83597.010195
[2020-12-01 22:43:15,143] [INFO] Iteration: 10	Loss: 82401.065625


In [8]:
token_sequence = next(chaine.token_sequences(dataset["train"]["tokens"]))

In [9]:
crf.predict(token_sequence.featurize())

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

## Custom features and lower-level training

In [10]:
import spacy
from chaine.data import Token, TokenSequence

In [11]:
help(TokenSequence.featurize)

Help on function featurize in module chaine.data:

featurize(self) -> Generator[List[str], NoneType, NoneType]
    Extract features from tokens of the sequence
    
    Note
    ----
    Overwrite this method for custom feature extraction – which is generally
    recommended as the default features may result in very low accuracy.
    
    One token is represented as a set of strings, each string is a unique feature,
    e.g. the string representation of the current token.



In [37]:
class CustomTokenSequence(TokenSequence):  
    def featurize(self):
        nlp = spacy.load("de_core_news_sm")
        nlp.tokenizer = nlp.tokenizer.tokens_from_list
        pos = [t.pos_ for s in nlp.pipe([[token.text for token in self.items]]) for t in s]   
        
        for token, tag in zip(self.items, pos):
            token.pos = tag
        
        for token in self.items:
            features = {
                f"token.lower()={token.lower()}",
                f"token.is_upper()={token.is_upper}",
                f"token.is_title()={token.is_title}",
                f"token.is_digit()={token.is_digit}",
                f"token.pos={token.pos}",
            }

            if token.index > 0:
                left_token = self.items[token.index - 1]
                features.update(
                    {
                        f"-1:token.lower()={left_token.lower()}",
                        f"-1:token.is_title()={left_token.is_title}",
                        f"-1:token.is_upper()={left_token.is_upper}",
                        f"-1:token.pos={left_token.pos}",
                    }
                )
            else:
                features.add("BOS=True")

            if token.index < max(self.indices):
                right_token = self.items[token.index + 1]
                features.update(
                    {
                        f"+1:token.lower()={right_token.lower()}",
                        f"+1:token.is_title()={right_token.is_title}",
                        f"+1:token.is_upper()={right_token.is_upper}",
                        f"+1:token.pos={right_token.pos}",
                    }
                )
            else:
                features.add("EOS=True")
            yield features

In [54]:
nlp = spacy.load("de_core_news_sm")
nlp.tokenizer = nlp.tokenizer.tokens_from_list

def featurize(sequence):
    pos = [t.pos_ for s in nlp.pipe([[token.text for token in self.items]]) for t in s]   

    for token, tag in zip(self.items, pos):
        token.pos = tag

[{'+1:token.is_title()=True',
  '+1:token.is_upper()=False',
  '+1:token.lower()=schwachstelle',
  '+1:token.pos=NOUN',
  'BOS=True',
  'token.is_digit()=False',
  'token.is_title()=True',
  'token.is_upper()=False',
  'token.lower()=eine',
  'token.pos=DET'},
 {'+1:token.is_title()=False',
  '+1:token.is_upper()=False',
  '+1:token.lower()=war',
  '+1:token.pos=AUX',
  '-1:token.is_title()=True',
  '-1:token.is_upper()=False',
  '-1:token.lower()=eine',
  '-1:token.pos=DET',
  'token.is_digit()=False',
  'token.is_title()=True',
  'token.is_upper()=False',
  'token.lower()=schwachstelle',
  'token.pos=NOUN'},
 {'+1:token.is_title()=False',
  '+1:token.is_upper()=False',
  '+1:token.lower()=beispielsweise',
  '+1:token.pos=ADV',
  '-1:token.is_title()=True',
  '-1:token.is_upper()=False',
  '-1:token.lower()=schwachstelle',
  '-1:token.pos=NOUN',
  'token.is_digit()=False',
  'token.is_title()=False',
  'token.is_upper()=False',
  'token.lower()=war',
  'token.pos=AUX'},
 {'+1:token.is

In [43]:
s = []

for sequence in [CustomTokenSequence(s.items).featurize() for s in token_sequences]:
    s.append(list(sequence.featurize()))

AttributeError: 'generator' object has no attribute 'featurize'

In [39]:
crf = chaine.train(token_sequences, label_sequences, max_iterations=100)

[2020-12-01 22:46:24,645] [INFO] Loading data


KeyboardInterrupt: 