## GermEval 2014 (German NER) 数据预处理

处理数据

In [None]:
import sys
import csv
from transformers import BertTokenizer

In [59]:
# Raw GermEval 2014 TSV inputs, one per split.
train_file = './data/NER-de-train.tsv'
dev_file = './data/NER-de-dev.tsv'
test_file = './data/NER-de-test.tsv'

# Inputs and outputs are kept in matching order (train, dev, test) so they
# can be processed pairwise.
raw_files = [train_file, dev_file, test_file]
save_files = [
    './data/train.txt',
    './data/dev.txt',
    './data/test.txt',
]

# Destination for the set of NER labels collected from the processed files.
label_file = './data/labels.txt'

In [None]:
# Identifier of the pretrained BERT checkpoint whose tokenizer is loaded below.
model_name_or_path = 'bert-base-multilingual-cased'
# Module-level tokenizer; `clean_fun` calls `tokenizer.tokenize` on every
# token to measure its sub-word length.
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

原始数据有四列（以 tab 分隔），预处理程序只保留其中相关的两列：token 和外层 (outer span) NER 标注。

```shell
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
```


In [None]:
def extrac_columns(fp):
    """Extract the token and outer NER label columns from a raw TSV file.

    The file is expected to hold tab-separated rows with four columns; the
    second column (token) and third column (outer-span NER label) are kept
    and joined with a single space.

    Args:
        fp: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A list of strings. Each data row becomes ``'<token> <label>'``
        (no trailing newline); each sentence boundary becomes ``'\\n'``.
        Rows starting with ``'#'`` and rows that do not have exactly four
        tab-separated columns are dropped.
    """
    lines = []
    with open(fp, 'r', encoding='utf-8') as f:
        for line in f:
            # Comment / metadata rows carry no tokens.
            if line.startswith('#'):
                continue

            # A blank row marks a sentence boundary. Whitespace-only rows
            # (stray spaces or tabs) are treated the same way; otherwise they
            # would fail the column check below and two sentences would be
            # silently merged.
            if not line.strip():
                lines.append('\n')
                continue

            items = line.split('\t')
            # Best effort: rows without exactly four columns are skipped.
            if len(items) == 4:
                lines.append('{} {}'.format(items[1], items[2]))
    return lines


GermEval 2014 数据集包含一些特殊的控制字符：`'\x96'`、`'\u200e'`、`'\x95'`、`'\xad'` 和 `'\x80'`。这些字符不在 `BertTokenizer` 的词表中，分词后会得到空的结果（0 个子词），导致分词结果与输入 `InputExample` 中的 token 无法对齐。

因此需要过滤掉这些 tokens；此外，还需要把子词数超过最大长度 (`MAX_LENGTH`) 的长句子切分成短句子。


```shell
wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
```

定义变量:

```shell
export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased

python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
```


In [None]:
def clean_fun(lines, max_len=128, tokenize=None):
    """Drop untokenizable tokens and split over-long sentences.

    Each non-blank line is expected to look like ``'<token> <label>'``; blank
    lines separate sentences. The sub-word length of every token is measured
    with ``tokenize`` and a sentence boundary (an empty string) is inserted
    whenever adding the next token would push the running sub-word count of
    the current sentence above ``max_len``.

    Args:
        lines: Iterable of ``'<token> <label>'`` strings and blank lines.
            Trailing whitespace (including newlines) is stripped.
        max_len: Maximum number of sub-words allowed per output sentence.
        tokenize: Callable mapping a token string to a sequence of
            sub-words. Defaults to ``tokenizer.tokenize`` of the
            module-level ``tokenizer``.

    Returns:
        A list of cleaned lines, with ``''`` marking sentence boundaries.
    """
    if tokenize is None:
        # Fall back to the notebook-level tokenizer when none is supplied.
        tokenize = tokenizer.tokenize

    text = []
    subword_len_counter = 0  # sub-words accumulated in the current sentence
    for line in lines:
        line = line.rstrip()

        # Blank line: keep the sentence boundary and restart the count.
        if not line:
            text.append(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]
        current_subwords_len = len(tokenize(token))

        # Tokens made only of odd control characters (e.g. \x96 or \x95)
        # tokenize to nothing; drop the whole line.
        if current_subwords_len == 0:
            continue

        if subword_len_counter + current_subwords_len > max_len:
            # Start a new sentence with this token. The counter must begin at
            # this token's length (not 0), otherwise the new sentence could
            # grow to max_len plus the length of its first token.
            text.append("")
            text.append(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len
        text.append(line)
    return text


In [None]:
def save_txt(fp, text):
    """Write the strings in ``text`` to ``fp``, one per line.

    The file is overwritten and encoded as UTF-8. Entries are separated by
    ``'\\n'``; no newline is written after the last entry.

    Args:
        fp: Destination file path.
        text: Iterable of strings to write.
    """
    with open(fp, mode='w', encoding='utf-8') as out:
        payload = '\n'.join(text)
        out.write(payload)


In [None]:
# Preprocess every split: pair each raw TSV file with its output path, then
# extract the two relevant columns, clean/split the sentences and save them.
for rfp, sfp in zip(raw_files, save_files):
    print(rfp, sfp)  # progress output: source path and destination path
    lines = extrac_columns(rfp)
    # Cap each output sentence at 128 sub-words.
    text = clean_fun(lines, max_len=128)
    save_txt(sfp, text)

获取标签

In [60]:
def extrac_labels(fps, label_fp):
    """Collect the distinct labels found in ``fps`` and write them to a file.

    Every non-blank line is split on whitespace and its second field is taken
    as the label. The distinct labels are written to ``label_fp`` one per
    line, in sorted order so that the output is identical across runs.

    Args:
        fps: Iterable of paths to UTF-8 files with ``'<token> <label>'``
            lines and blank lines between sentences.
        label_fp: Path of the label file to write (UTF-8, overwritten).
    """
    labels = set()
    for fp in fps:
        with open(fp, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                # Blank lines are sentence separators, and lines with a
                # single field have no label column; neither contributes
                # a label.
                if len(fields) < 2:
                    continue
                labels.add(fields[1])
    with open(label_fp, 'w', encoding='utf-8') as f:
        # Set iteration order is not stable between runs; sort so the label
        # file (and any label ids derived from line order) is reproducible.
        f.write('\n'.join(sorted(labels)))
        

In [61]:
# Build the label file from the preprocessed train/dev/test files.
extrac_labels(save_files, label_file)