## 🛑 Wait a second - maybe also look at the training
- My training notebook (containing equally many emojis) is here: I would love an upvote if you use the notebook or learned something new!
- https://www.kaggle.com/code/valentinwerner/915-deberta3base-training

## 🏟️ Credits (because most of this baseline already existed when I joined)

- Stride is something Raja first shared: https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/473011
- The biggest booster in performance is changing triplets to pairs in token reassembling: https://www.kaggle.com/code/nbroad/transformer-ner-baseline-lb-0-854/comments#2659393

In [1]:
# Maximum number of tokens per split; passed to the tokenizer as `max_length`.
INFERENCE_MAX_LENGTH = 1024
# Number of tokens shared between consecutive splits; passed to the tokenizer as `stride`.
STRIDE = 384

# Directory of the fine-tuned model; the tokenizer, the model weights and
# config.json are all loaded from here.
# Training the model with stride as well (for example as in
# https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b)
# will also improve performance.
model_path = "/kaggle/input/training-baseline/deberta3base_1024_with_paragraphs_augmentation_weighted_loss_with_nicholas"

In [2]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset 
import numpy as np



## ♟️ Data Loading & Data Tokenization
- This tokenization step is special compared to usual NLP challenges
- The inference tokenizer is a bit different from the training tokenizer, because we don't have labels

In [3]:
def tokenize(example, tokenizer):
    """Rebuild a document's text and tokenize it into overlapping splits.

    Args:
        example: mapping with "tokens" (list of token strings) and
            "trailing_whitespace" (one flag per token: is it followed by a space).
        tokenizer: callable applied to the rebuilt text; it receives the
            offset-mapping, truncation, max_length, stride and overflow options.

    Returns:
        A dict with everything the tokenizer returned plus "token_map", a list
        with one entry per character of the rebuilt text: the index of the
        source token that character belongs to, or -1 for an inserted space.
    """
    pieces = []
    token_map = []

    pairs = zip(example["tokens"], example["trailing_whitespace"])
    for word_idx, (word, has_space) in enumerate(pairs):
        pieces.append(word)
        # every character of this token points back to the token's index
        token_map += [word_idx] * len(word)
        if has_space:
            pieces.append(" ")
            # the separating space belongs to no token
            token_map.append(-1)

    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
    )

    return {**encoded, "token_map": token_map}

In [4]:
# Read the test documents. The context manager closes the file handle as soon
# as the JSON has been parsed (a bare open() leaves it open).
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json") as test_file:
    data = json.load(test_file)

# Column-oriented view of the documents, keeping only the fields used below.
ds = Dataset.from_dict({
    "full_text": [doc["full_text"] for doc in data],
    "document": [doc["document"] for doc in data],
    "tokens": [doc["tokens"] for doc in data],
    "trailing_whitespace": [doc["trailing_whitespace"] for doc in data],
})

# The tokenizer is loaded from the same directory as the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Adds the tokenizer outputs and "token_map" to every document.
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

## 🏋🏻‍♀️ Trainer Class based on the trained model

In [5]:
# Fine-tuned token-classification model, loaded from the same directory as the tokenizer.
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Pads each batch so that its sequence length is a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=16)

# In this notebook the Trainer is only used for prediction: one sequence per
# batch, current directory as output dir, no experiment-tracking integration.
args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)

### Striding functions

Because using a stride makes consecutive splits overlap, the duplicated tokens have to be removed (either pick one side of the overlap, or average the two predictions, ...).

In [6]:
def backwards_map_preds(sub_predictions, max_len, split_idx=None, stride=None):
    """Trim special-token and overlap positions from one split's predictions.

    Args:
        sub_predictions: 3-D array of predictions for one split; the second
            axis is the one that gets sliced.
        max_len: total number of splits the document was cut into.
        split_idx: position of this split within the document. When omitted,
            the module-level loop variable `i` is used, which is what the
            original two-argument call relied on.
        stride: number of overlapping positions to drop at the start of every
            split after the first. Defaults to the module-level `STRIDE`.

    Returns:
        `sub_predictions` unchanged when the document has a single split,
        otherwise a slice of it along the second axis.
    """
    # Nothing to map backwards if the sequence was too short to be split at all.
    if max_len == 1:
        return sub_predictions

    # Backward compatibility: older callers pass no index and rely on the
    # global loop variable `i` of the prediction loop.
    if split_idx is None:
        split_idx = i

    if split_idx == 0:
        # First split: drop only the final position (the SEP that ends it).
        return sub_predictions[:, :-1, :]

    if stride is None:
        stride = STRIDE

    if split_idx == max_len - 1:
        # Last split: drop CLS + the overlapping stride positions at the start.
        return sub_predictions[:, 1 + stride:, :]

    # Middle split: drop CLS + stride positions at the start and SEP at the end.
    return sub_predictions[:, 1 + stride:-1, :]

def backwards_map_(row_attribute, max_len, split_idx=None, stride=None):
    """Trim special-token and overlap entries from one split's per-token list.

    Same slicing rules as for the prediction arrays, applied to a flat
    sequence (for example the offset mapping of one split).

    Args:
        row_attribute: sequence with one entry per position of the split.
        max_len: total number of splits the document was cut into.
        split_idx: position of this split within the document. When omitted,
            the module-level loop variable `i` is used, which is what the
            original two-argument call relied on.
        stride: number of overlapping entries to drop at the start of every
            split after the first. Defaults to the module-level `STRIDE`.

    Returns:
        `row_attribute` unchanged when the document has a single split,
        otherwise a slice of it.
    """
    # A document that fits in one split needs no trimming.
    if max_len == 1:
        return row_attribute

    # Backward compatibility: fall back to the global loop variable `i`.
    if split_idx is None:
        split_idx = i

    if split_idx == 0:
        # First split: drop only the final entry.
        return row_attribute[:-1]

    if stride is None:
        stride = STRIDE

    if split_idx == max_len - 1:
        # Last split: drop the leading entry + the overlapping stride entries.
        return row_attribute[1 + stride:]

    # Middle split: drop leading entry + stride entries, and the final entry.
    return row_attribute[1 + stride:-1]

In [7]:
%%time
# One concatenated prediction array per document.
preds = []
# Per-document columns for the dataset that is rebuilt after prediction.
ds_dict = {
    "document": [],
    "token_map": [],
    "offset_mapping": [],
    "tokens": [],
}

for row in ds:
    n_splits = len(row["offset_mapping"])
    # pieces that have to be re-assembled across the splits of this document
    row_preds, row_offset = [], []

    # NOTE: the loop variable must stay named `i`; when called with two
    # arguments the trimming helpers read the split index from the global `i`.
    for i, y in enumerate(row["offset_mapping"]):
        # single-example dataset holding just this split of the document
        split_features = {
            key: [row[key][i]]
            for key in ("token_type_ids", "input_ids", "attention_mask", "offset_mapping")
        }
        x = Dataset.from_dict(split_features)
        # predict for that split
        pred = trainer.predict(x).predictions
        # strip the stride overlap and the extra CLS / SEP positions
        row_preds.append(backwards_map_preds(pred, n_splits))
        row_offset.extend(backwards_map_(y, n_splits))

    # Finalize the row: copy the untouched columns, store the stitched offsets
    for key in ("document", "tokens", "token_map"):
        ds_dict[key].append(row[key])
    ds_dict["offset_mapping"].append(row_offset)

    # Stitch the per-split predictions together along the sequence axis
    p_concat = np.concatenate(row_preds, axis=1)
    preds.append(p_concat)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


CPU times: user 1.26 s, sys: 80.7 ms, total: 1.34 s
Wall time: 1.65 s


In [8]:
# Read the label mapping saved with the model; the context manager closes the
# file handle once the JSON is parsed.
with open(Path(model_path) / "config.json") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Minimum probability the "O" class needs before it is allowed to win.
# Loop-invariant, so it is defined once up front.
threshold = 0.9

preds_final = []
for predictions in preds:
    # Numerically stable softmax over the class axis: subtracting the
    # per-position maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large logits.
    shifted_logits = predictions - predictions.max(axis=2, keepdims=True)
    exp_logits = np.exp(shifted_logits)
    predictions_softmax = exp_logits / exp_logits.sum(axis=2, keepdims=True)

    # Plain argmax over all classes.
    predicted_ids = predictions.argmax(-1)
    # Best class among ids 0-11, and the probability of id 12.
    # NOTE(review): assumes label id 12 is "O" and ids 0-11 are the PII
    # classes, as the variable names suggest; confirm against config.json.
    predictions_without_O = predictions_softmax[:, :, :12].argmax(-1)
    O_predictions = predictions_softmax[:, :, 12]

    # Where "O" is not confident enough, take the best non-"O" class instead.
    preds_final.append(np.where(O_predictions < threshold, predictions_without_O, predicted_ids))

### Reassembling
Note that triplets were changed to pairs to remove the false-negative (FN) predictions created by ignoring new triplets.

In [9]:
# Rebuild the dataset from the re-assembled per-document columns.
ds = Dataset.from_dict(ds_dict)

# (document, token index) pairs already emitted, in insertion order.
pairs = []
# Same pairs as a set: O(1) membership test instead of scanning the list for
# every predicted token.
seen_pairs = set()
document, token, label, token_str = [], [], [], []

for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
    # p[0]: predicted label id per position; offsets: (start, end) character
    # span of each position in the rebuilt document text.
    for token_pred, (start_idx, end_idx) in zip(p[0], offsets):
        label_pred = id2label[str(token_pred)]

        # positions with a (0, 0) span carry no text (special tokens)
        if start_idx + end_idx == 0: continue

        # span starts on an inserted space: move on to the next character
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # ran past the end of the document: nothing left to map
        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            pair = (doc, token_id)

            # keep only the first prediction for each source token
            if pair not in seen_pairs:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                pairs.append(pair)
                seen_pairs.add(pair)

## 🤝 Submission hand-in

In [10]:
# One row per predicted (document, token) pair; row_id is a running
# counter starting at 0.
df = pd.DataFrame(
    {
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str,
    }
).assign(row_id=lambda frame: list(range(len(frame))))

# Preview of the first rows.
display(df.head(100))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [11]:
# Write the submission columns only (token_str is left out), without the index.
submission_columns = ["row_id", "document", "token", "label"]
df.loc[:, submission_columns].to_csv("submission.csv", index=False)