# Transformer NER baseline [lb 0.854]

The following is a basic script to train and run inference using `transformers` using 2x T4 GPUs. You might get better performance if you use a bigger model, or one that has already been trained on NER.

It includes processing to correctly map the given tokens with labels during training and vice versa when running inference.


Update: Thanks to @takanashihumbert, I switched `tokens` for `token_map` which helped improve the score from 0.569 to 0.854!


deberta-v3-small: lb 0.576  
deberta-v3-base: lb 0.569 (before update) --> 0.854 (after update) 

In [28]:
TRAINING = False # be sure to turn internet off if doing inference

TRAINING_MODEL_PATH = "microsoft/deberta-v3-large"
TRAINING_MAX_LENGTH = 512

INFERENCE_MODEL_PATH = "/kaggle/input/pii-data-detection-baseline/output/checkpoint-240"
INFERENCE_MAX_LENGTH = 2000

if TRAINING:
    !pip install seqeval evaluate -q
    !pip install -U datasets accelerate transformers -q

In [29]:
if TRAINING:
    import json

    data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/train.json"))

    print(len(data))
    print(data[0].keys())

    x = data[0]

    print(x["tokens"][:10])
    print(x["labels"][:10])
    print(x["trailing_whitespace"][:10])

In [30]:
if TRAINING:
    from itertools import chain

    all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
    label2id = {l: i for i,l in enumerate(all_labels)}
    id2label = {v:k for k,v in label2id.items()}

    id2label

In [31]:
if TRAINING:
    from transformers import AutoTokenizer
    import numpy as np

    tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

    def tokenize(example, tokenizer, label2id):
        text = []

        # these are at the character level
        labels = []

        for t, l, ws in zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"]):

            text.append(t)
            labels.extend([l]*len(t))

            # if there is trailing whitespace
            if ws:
                text.append(" ")
                labels.append("O")


        tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=False)

        labels = np.array(labels)

        text = "".join(text)
        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:

            # CLS token
            if start_idx + end_idx == 0: 
                token_labels.append(label2id["O"])
                continue

            # case when token starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1
            
            while start_idx >= len(labels):
                start_idx -= 1

            token_labels.append(label2id[labels[start_idx]])

        length = len(tokenized.input_ids)

        return {
            **tokenized,
            "labels": token_labels,
            "length": length
        }

In [32]:
if TRAINING:
    from datasets import Dataset

    ds = Dataset.from_dict({
        "full_text": [x["full_text"] for x in data],
        "document": [x["document"] for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
        "provided_labels": [x["labels"] for x in data],
    })

In [33]:
if TRAINING:
    ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id}, num_proc=2)

In [34]:
if TRAINING:
    # Confirm that alignment is good

    # run multiple times to see different rows
    x = ds.shuffle()[0]

    for t,l in zip(x["tokens"], x["provided_labels"]):
        if l != "O":
            print((t,l))

    print("*"*100)
            
    for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
        if id2label[l] != "O":
            print((t,id2label[l]))

## There are some long ones that will get truncated when training if you use a typical max_length

There might be some key labels that are at the end that are being missed.

In [35]:
if TRAINING:
    import matplotlib.pyplot as plt


    plt.hist(ds["length"], bins=100);

In [36]:
if TRAINING:
    import pandas as pd
    import plotly.express as px
    from collections import Counter
    
    
    group = []
    labels = []
    
    group_thresholds = [0, 50, 100, 200, 500, 1000, 2000, 10000]
    
    for sample_labels in ds["provided_labels"]:
        for i, label in enumerate(sample_labels):
            if label != "O":
                for j in range(1, len(group_thresholds)):
                    lower = group_thresholds[j-1]
                    upper = group_thresholds[j]
                    
                    if lower <= i < upper:
                        group.append(f"{lower}-{upper}")
                        labels.append(label)
                        break
    
    pairs = list(zip(labels, group))
    
    counts = Counter(pairs)
    
    
    data = {
        "label": [],
        "count": [],
        "range": [],
    }
    
    for (label, range_), count in counts.items():
        data["label"].append(label)
        data["range"].append(range_)
        data["count"].append(count)
    
                
    df = pd.DataFrame(data)
    
    
    px.scatter(df, x="range", y="count", color="label", log_y=True, height=1000)

In [37]:
%%writefile run.py

import os
import json
import argparse
import random
from itertools import chain
from functools import partial

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from tokenizers import AddedToken
import evaluate
from datasets import Dataset
import numpy as np

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/468844
def filter_no_pii(example, percent_allow=0.2):
    # Return True if there is PII
    # Or 20% of the time if there isn't
    
    has_pii = set("O") != set(example["provided_labels"])
    
    return has_pii or (random.random() < percent_allow)

def tokenize(example, tokenizer, label2id, max_length):
    text = []
    labels = []
    
    for t, l, ws in zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"]):
        
        text.append(t)
        labels.extend([l]*len(t))
        if ws:
            text.append(" ")
            labels.append("O")
    
    
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=max_length)
    
    labels = np.array(labels)
    
    text = "".join(text)
    token_labels = []
    
    for start_idx, end_idx in tokenized.offset_mapping:
        
        # CLS token
        if start_idx == 0 and end_idx == 0: 
            token_labels.append(label2id["O"])
            continue
        
        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1
        
        while start_idx >= len(labels):
            start_idx -= 1
            
        token_labels.append(label2id[labels[start_idx]])
        
    length = len(tokenized.input_ids)
        
    return {
        **tokenized,
        "labels": token_labels,
        "length": length
    }
    
def compute_metrics(p, metric, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    return final_results   

def main():	20
    
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--model_path", type=str)
    parser.add_argument("--max_length", type=int)
    
    args = parser.parse_args()
    
    
    data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/train.json"))


    all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
    label2id = {l: i for i,l in enumerate(all_labels)}
    id2label = {v:k for k,v in label2id.items()}


    ds = Dataset.from_dict({
        "full_text": [x["full_text"] for x in data],
        "document": [x["document"] for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
        "provided_labels": [x["labels"] for x in data],b 0.576  
    })

    
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    
    # lots of newlines in the text
    # adding this should be helpful
    tokenizer.add_tokens(AddedToken("\n", normalized=False))
    
    ds = ds.filter(
        filter_no_pii,
        num_proc=2,
    )
    
    ds = ds.map(
        tokenize, 
        fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": args.max_length}, 
        num_proc=2,
    )


    metric = evaluate.load("seqeval")


    model = AutoModelForTokenClassification.from_pretrained(args.model_path, num_labels=len(all_labels), id2label=id2label, label2id=label2id)
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)

    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

    args = TrainingArguments(
        "output", 
        fp16=True, 
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=4, 
        report_to="none",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        logging_steps=5,
        metric_for_best_model="overall_recall",
        greater_is_better=True,
        gradient_checkpointing=True,
        num_train_epochs=1
        dataloader_num_workers=1,
    )

    # may want to try to balance classes in splits
    final_ds = ds.train_test_split(test_size=0.2)


    trainer = Trainer(
        model=model, 
        args=args, 
        train_dataset=final_ds["train"], 
        eval_dataset=final_ds["test"], 
        data_collator=collator, 
        tokenizer=tokenizer,
        compute_metrics=partial(compute_metrics, metric=metric, all_labels=all_labels),
    )


    trainer.train()
    
    
if __name__ == "__main__":
    main()

Overwriting run.py


In [38]:
if TRAINING:
    # utilize both t4 gpus
    !accelerate launch --multi_gpu --num_processes 2 run.py \
      --model_path $TRAINING_MODEL_PATH \
      --max_length $TRAINING_MAX_LENGTH

In [39]:
%%writefile infer.py

import json
import argparse
from itertools import chain

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

def tokenize(example, tokenizer, max_length):
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
            
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=max_length)
    
        
    return {
        **tokenized,
        "token_map": token_map,
    }

def main():
    
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--model_path", type=str)
    parser.add_argument("--max_length", type=int)
    
    args = parser.parse_args()
    
    data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))
    
    ds = Dataset.from_dict({
        "full_text": [x["full_text"] for x in data],
        "document": [x["document"] for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    })

    
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    ds = ds.map(
        tokenize, 
        fn_kwargs={"tokenizer": tokenizer, "max_length": args.max_length}, 
        num_proc=2,
    )
    
    model = AutoModelForTokenClassification.from_pretrained(args.model_path)

    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

    args = TrainingArguments(
        ".", 
        per_device_eval_batch_size=4, 
        report_to="none",
    )
    
    trainer = Trainer(
        model=model, 
        args=args, 
        data_collator=collator, 
        tokenizer=tokenizer,
    )
    
    
    predictions = trainer.predict(ds).predictions

    ds.to_parquet("test_ds.pq")
    
    np.save("preds.npy", predictions)
    
    
if __name__ == "__main__":
    main()

Overwriting infer.py


In [40]:
if not TRAINING:
    
    !accelerate launch --num_processes 2 infer.py \
      --model_path $INFERENCE_MODEL_PATH \
      --max_length $INFERENCE_MAX_LENGTH

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map (num_proc=2):   0%|                           | 0/10 [00:00<?, ? examples/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy mor

### Recall is much more important than precision, so it might make sense to make predictions even if they aren't the highest score

In [41]:
if not TRAINING:
    
    import numpy as np
    import json
    from datasets import Dataset
    import pandas as pd
    from pathlib import Path

    config = json.load(open(Path(INFERENCE_MODEL_PATH) / "config.json"))

    id2label = config["id2label"]

    preds = np.load("preds.npy")

    ds = Dataset.from_parquet("test_ds.pq")

    preds = preds.argmax(-1)

    triplets = []
    document, token, label, token_str = [], [], [], []
    for p, token_map, offsets, tokens, doc in zip(preds, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[str(token_pred)]

            if start_idx + end_idx == 0: continue

            if token_map[start_idx] == -1: 
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map): break

            token_id = token_map[start_idx]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in triplets:
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.append(triplet)


    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    df["row_id"] = list(range(len(df)))

    display(df.head(50))


    df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)


A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.3



Generating train split: 0 examples [00:00, ? examples/s]

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
