# PII Inference Notebook

In [1]:
# Inference settings: the `max_length` and `stride` values handed to the
# tokenization helper, and the checkpoint directory the tokenizer is loaded from.
INFERENCE_MAX_LENGTH, STRIDE = 512, 192
model_path = "/kaggle/input/pii-deberta-models/DeBERTA-V3-base-512-first"

In [2]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
from pii_data_utility_script import tokenize_for_inference

2024-03-05 21:21:03.806052: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 21:21:03.806174: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 21:21:03.939726: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Tokenization

In [3]:
# Read the test documents. The context manager closes the file handle as soon
# as parsing is done instead of leaving it to the garbage collector.
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as test_file:
    data = json.load(test_file)
len(data)

10

In [4]:
# Turn the list of document records into a column-oriented dataset, keeping
# only the fields that are forwarded to the tokenization helper.
_columns = ("full_text", "document", "tokens", "trailing_whitespace")
ds = Dataset.from_dict(
    {name: [record[name] for record in data] for name in _columns}
)

# Tokenize every document with the helper from the utility script, using two
# worker processes.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenize_kwargs = {
    "tokenizer": tokenizer,
    "max_length": INFERENCE_MAX_LENGTH,
    "stride": STRIDE,
}
ds = ds.map(tokenize_for_inference, fn_kwargs=tokenize_kwargs, num_proc=2)

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

## Ensemble

Originally from: https://www.kaggle.com/code/wonbulzhang/0-962-piidd-let-s-go-higher

In [5]:
import gc
import torch
import numpy as np
import time

from scipy.special import softmax

In [6]:
# Ensemble members: checkpoint directory -> weight applied to that model's
# class probabilities. Insertion order is the order the models are run in.
model_paths = dict(
    zip(
        (
            "/kaggle/input/pii-deberta-models/DeBERTA-V3-base-512-first",
            "/kaggle/input/pii-deberta-models/DeBERTA-V3-base-512-middle",
            "/kaggle/input/pii-deberta-models/DeBERTA-V3-base-512-last",
        ),
        (3 / 5, 1 / 5, 1 / 5),
    )
)

## Postprocessing

### Striding functions

Because striding makes consecutive windows overlap by `STRIDE` tokens, the duplicated token predictions have to be resolved (either keep one side of the overlap, or average the two, ...).

https://www.kaggle.com/code/valentinwerner/945-deberta-3-base-striding-inference/notebook

In [7]:
def backwards_map_preds(i, sub_predictions, max_len, stride=None):
    """Trim one window's predictions so concatenated windows do not overlap.

    Args:
        i: Position of this window within its document (0-based).
        sub_predictions: 3-D array sliced along its second axis
            (presumably ``(batch, sequence, labels)`` -- confirm with caller).
        max_len: Total number of windows the document was split into.
        stride: Number of leading positions that repeat the previous window.
            Defaults to the notebook-level ``STRIDE`` when omitted.

    Returns:
        ``sub_predictions`` unchanged when the document has a single window;
        otherwise a slice of it:
          * first window  -> last position dropped (the closing special token),
          * last window   -> first ``1 + stride`` positions dropped (the
            opening special token plus the overlap),
          * middle window -> both of the above.
    """
    if stride is None:
        stride = STRIDE

    # Nothing to trim if the sequence was never split in the first place.
    if max_len == 1:
        return sub_predictions

    if i == 0:
        return sub_predictions[:, :-1, :]

    overlap_end = 1 + stride  # opening special token + stride tokens
    if i == max_len - 1:
        return sub_predictions[:, overlap_end:, :]

    return sub_predictions[:, overlap_end:-1, :]

In [8]:
def backwards_map_(i, row_attribute, max_len, stride=None):
    """Trim a per-position list for one window so windows can be concatenated.

    Applies the same trimming rule as ``backwards_map_preds`` but to a plain
    sequence (e.g. a window's offset mapping) instead of a 3-D array.

    Args:
        i: Position of this window within its document (0-based).
        row_attribute: Sliceable sequence with one entry per window position.
        max_len: Total number of windows the document was split into.
        stride: Number of leading positions that repeat the previous window.
            Defaults to the notebook-level ``STRIDE`` when omitted.

    Returns:
        ``row_attribute`` unchanged for a single-window document; otherwise
        the slice without the closing entry (first window), without the first
        ``1 + stride`` entries (last window), or without both (middle window).
    """
    if stride is None:
        stride = STRIDE

    if max_len == 1:
        return row_attribute

    if i == 0:
        return row_attribute[:-1]

    overlap_end = 1 + stride  # opening special token + stride tokens
    if i == max_len - 1:
        return row_attribute[overlap_end:]

    return row_attribute[overlap_end:-1]

In [9]:
# For every document (keyed by its position in `ds`), list one
# (window index, window offset mapping, full document row) tuple per entry of
# the row's "offset_mapping".
offset_dict = {}
for doc_pos, doc_row in enumerate(ds):
    offset_dict[doc_pos] = [
        (window_pos, window_offsets, doc_row)
        for window_pos, window_offsets in enumerate(doc_row["offset_mapping"])
    ]

In [10]:
# (document index, window index) -> list holding one weighted probability
# array per ensemble member, in the order of `model_paths`.
pred_dict = {}

# NOTE: the loop variable is deliberately still called `model_path`; the
# post-processing cell reads `model_path` after this loop to find config.json.
for model_path, weight in model_paths.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        tokenizer=tokenizer,
    )

    for idx, offset_list in offset_dict.items():
        for offset_idx, _, row in offset_list:
            # Run the model on a single window wrapped as a one-row dataset.
            window_ds = Dataset.from_dict(
                {
                    "token_type_ids": [row["token_type_ids"][offset_idx]],
                    "input_ids": [row["input_ids"][offset_idx]],
                    "attention_mask": [row["attention_mask"][offset_idx]],
                    "offset_mapping": [row["offset_mapping"][offset_idx]],
                }
            )
            predictions = trainer.predict(window_ds).predictions

            # Convert logits to probabilities and scale by this model's weight.
            weighted_predictions = softmax(predictions, axis=-1) * weight
            pred_dict.setdefault((idx, offset_idx), []).append(weighted_predictions)

    # Drop the references and collect garbage *before* emptying the CUDA cache:
    # empty_cache() can only release blocks that are no longer occupied, so
    # tensors kept alive by unreachable reference cycles must be collected first.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

In [11]:
# Pre-size the per-document outputs; every slot is filled in the loop below.
n_docs = len(offset_dict)
preds = [None] * n_docs
ds_dict = {
    "document": [None] * n_docs,
    "token_map": [None] * n_docs,
    "offset_mapping": [None] * n_docs,
    "tokens": [None] * n_docs,
}
total_weight = sum(model_paths.values())

for idx, offset_list in offset_dict.items():
    row_preds = []
    row_offset = []

    for offset_idx, offset_map, row in offset_list:
        n_windows = len(row["offset_mapping"])

        # Weighted average over the ensemble members for this window.
        pred = np.sum(pred_dict[(idx, offset_idx)], axis=0) / total_weight

        # Strip the stride overlap and the extra CLS & SEP positions.
        row_preds.append(backwards_map_preds(offset_idx, pred, n_windows))
        row_offset.extend(backwards_map_(offset_idx, offset_map, n_windows))

    # Finalize row: document-level fields are copied from the row carried by
    # the last window tuple; the offsets are the stitched list built above.
    for column in ("document", "tokens", "token_map"):
        ds_dict[column][idx] = row[column]
    ds_dict["offset_mapping"][idx] = row_offset

    # Finalize prediction collection by concatenating along the sequence axis.
    preds[idx] = np.concatenate(row_preds, axis=1)

In [12]:
# `model_path` is whatever the ensemble loop last bound it to; its config.json
# supplies the id -> label mapping. The context manager closes the file handle.
with open(Path(model_path) / "config.json") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# NOTE(review): assumes class index 12 is the "O" label and that it is the last
# class -- confirm against `id2label` before changing models.
o_label_id = 12
# An "O" prediction is only kept when its probability is at least this high.
threshold = 0.97

preds_final = []
for p in preds:
    predictions = p.argmax(-1)
    predictions_without_O = p[:, :, :o_label_id].argmax(-1)
    O_predictions = p[:, :, o_label_id]

    # Where "O" is not confident enough, fall back to the best non-"O" class.
    preds_final.append(
        np.where(O_predictions < threshold, predictions_without_O, predictions)
    )

## Reassembling

In [13]:
# Use a fresh name rather than rebinding `ds`: the earlier cells read the
# tokenized `ds`, and overwriting it would make them fail when re-run.
reassembled_ds = Dataset.from_dict(ds_dict)

# (document, token index) pairs already emitted, so each token is reported once.
pairs = set()
document, token, label, token_str = [], [], [], []

for p, token_map, offsets, tokens, doc in zip(
    preds_final,
    reassembled_ds["token_map"],
    reassembled_ds["offset_mapping"],
    reassembled_ds["tokens"],
    reassembled_ds["document"],
):
    map_len = len(token_map)

    # p[0] is the single batch entry; pair each position with its offset span.
    for token_pred, (start_idx, end_idx) in zip(p[0], offsets):
        label_pred = id2label[str(token_pred)]

        # A (0, 0) span carries no text; skip it.
        if start_idx + end_idx == 0:
            continue

        # Step past a position that maps to no token (-1).
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n" (and any other whitespace-only token)
        while start_idx < map_len and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= map_len:
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            pair = (doc, token_id)

            if pair not in pairs:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                pairs.add(pair)

In [14]:
# Assemble the collected predictions into a table, number the rows
# consecutively from 0, and preview the first 100.
df = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
)
df["row_id"] = [*range(len(df))]
display(df.head(100))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [15]:
# Write the submission file: the four listed columns in this order, no index.
df.to_csv(
    "submission.csv",
    columns=["row_id", "document", "token", "label"],
    index=False,
)