In [1]:
import os

os.chdir("../")

# PII Evaluation Notebook

In [2]:
INFERENCE_MAX_LENGTH=1024
STRIDE=386
model_path = 'model_dir/DeBERTA-V3-base-1024-first'

In [3]:
import json
import pandas as pd
import time
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from datasets import Dataset
import re
from spacy.lang.en import English

from src.data import tokenize_for_inference
from src.metric import evaluate

## Tokenization

In [4]:
data = pd.read_parquet("artifacts/raw_data:v0/raw_data.parquet")
data = data[data.valid == True]

In [5]:
ds = Dataset.from_dict(
    {
        "full_text": data["full_text"].tolist(),
        "document": data["document"].tolist(),
        "tokens": data["tokens"].tolist(),
        "trailing_whitespace": data["trailing_whitespace"].tolist(),
    }
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
ds = ds.map(
    tokenize_for_inference,
    fn_kwargs={
        "tokenizer": tokenizer,
        "max_length": INFERENCE_MAX_LENGTH,
        "stride": STRIDE,
    },
    num_proc=6,
)

Map (num_proc=6):   0%|          | 0/1146 [00:00<?, ? examples/s]

## Ensemble

Originally from: https://www.kaggle.com/code/wonbulzhang/0-962-piidd-let-s-go-higher

In [6]:
import gc
import torch
import numpy as np

from scipy.special import softmax

In [7]:
model_paths = {
    "model_dir/DeBERTA-V3-base-1024-first": 1 / 3,
    "model_dir/DeBERTA-V3-base-1024-middle": 1 / 3,
    "model_dir/DeBERTA-V3-base-1024-last": 1 / 3,
}

## Postprocessing

### Striding functions

As using the stride give an overlap in tokens, these have to be removed (either pick one side of the stride or average them, ...).

https://www.kaggle.com/code/valentinwerner/945-deberta-3-base-striding-inference/notebook

In [8]:
def backwards_map_preds(i, sub_predictions, max_len):
    # nothing to map backwards if sequence is too short to be split in the first place
    if max_len != 1:
        if i == 0:
            # First sequence needs no SEP token (used to end a sequence)
            sub_predictions = sub_predictions[:, :-1, :]
        elif i == max_len - 1:
            # End sequence needs to CLS token + Stride tokens
            sub_predictions = sub_predictions[
                :, 1 + STRIDE :, :
            ]  # CLS tokens + Stride tokens
        else:
            # Middle sequence needs to CLS token + Stride tokens + SEP token
            sub_predictions = sub_predictions[:, 1 + STRIDE : -1, :]

    return sub_predictions

In [9]:
def backwards_map_(i, row_attribute, max_len):
    # Same logics as for backwards_map_preds - except lists instead of 3darray
    if max_len != 1:
        if i == 0:
            row_attribute = row_attribute[:-1]
        elif i == max_len - 1:
            row_attribute = row_attribute[1 + STRIDE :]
        else:
            row_attribute = row_attribute[1 + STRIDE : -1]

    return row_attribute

In [10]:
offset_dict = {
    idx: [
        (offset_idx, offset_map, row)
        for offset_idx, offset_map in enumerate(row["offset_mapping"])
    ]
    for idx, row in enumerate(ds)
}

In [11]:
pred_dict = {}

for model_path, weight in model_paths.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        tokenizer=tokenizer,
    )

    for idx, offset_list in offset_dict.items():
        for val in offset_list:
            offset_idx, _, row = val
            predictions = trainer.predict(
                Dataset.from_dict(
                    {
                        "token_type_ids": [row["token_type_ids"][offset_idx]],
                        "input_ids": [row["input_ids"][offset_idx]],
                        "attention_mask": [row["attention_mask"][offset_idx]],
                        "offset_mapping": [row["offset_mapping"][offset_idx]],
                    }
                )
            ).predictions
            weighted_predictions = softmax(predictions, axis=-1) * weight
            key = (idx, offset_idx)

            if key not in pred_dict.keys():
                pred_dict[key] = []

            pred_dict[key].append(weighted_predictions)

    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(2)

In [12]:
preds = [None for _ in range(len(offset_dict.items()))]
ds_dict = {"document": [None for _ in range(len(offset_dict.items()))], 
           "token_map": [None for _ in range(len(offset_dict.items()))], 
           "offset_mapping": [None for _ in range(len(offset_dict.items()))], 
           "tokens": [None for _ in range(len(offset_dict.items()))]}
total_weight = sum(model_paths.values())

for idx, offset_list in offset_dict.items():
    row_preds = [None for _ in range(len(offset_list))]
    row_offset = []

    for i, val in enumerate(offset_list):
        offset_idx, offset_map, row = val
        all_preds = pred_dict[(idx, offset_idx)]
        pred = np.sum(all_preds, axis=0) / total_weight

        # removing the stride and additional CLS & SEP that are created
        row_preds[i] = backwards_map_preds(offset_idx, pred, len(row["offset_mapping"]))
        row_offset += backwards_map_(offset_idx, offset_map, len(row["offset_mapping"]))

    # Finalize row
    ds_dict["document"][idx] = row["document"]
    ds_dict["tokens"][idx] = row["tokens"]
    ds_dict["token_map"][idx] = row["token_map"]
    ds_dict["offset_mapping"][idx] = row_offset

    # Finalize prediction collection by concattenating
    preds[idx] = np.concatenate(row_preds, axis=1)

In [13]:
config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]

preds_final = [None for _ in range(len(preds))]
for idx, p in enumerate(preds):
    predictions = p.argmax(-1)
    predictions_without_O = p[:, :, :12].argmax(-1)
    O_predictions = p[:, :, 12]

    threshold = 0.8
    preds_final[idx] = np.where(
        O_predictions < threshold, predictions_without_O, predictions
    )

## Reassembling

In [47]:
ds = Dataset.from_dict(ds_dict)
pairs = set()
document, token, label, token_str = [], [], [], []

for p, token_map, offsets, tokens, doc in zip(
    preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
):
    for token_pred, (start_idx, end_idx) in zip(p[0], offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0:
            continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            pair = (doc, token_id)

            if pair not in pairs:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                pairs.add(pair)

In [48]:
print(len(document))

705


### Rule based Integrations

In [40]:
nlp = English()


def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue

        span.append(i)
        idx += 1
        
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue

    return spans

In [41]:
email_regex = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")

for _, row in data.iterrows():
    # email
    for token_idx, t in enumerate(row["tokens"]):
        if re.fullmatch(email_regex, t) is not None:
            document.append(row["document"])
            token.append(token_idx)
            label.append("B-EMAIL")
            token_str.append(t)

    # phone number
    matches = phone_num_regex.findall(row["full_text"])

    if not matches:
        continue

    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, row["tokens"])

    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            document.append(row["document"])
            token.append(token_idx)
            label.append(f"{prefix}-PHONE_NUM")
            token_str.append(row["tokens"][token_idx])

In [42]:
print(len(document))

750


In [49]:
pred_df = pd.DataFrame(
    {"document": document, "token": token, "label": label, "token_str": token_str}
)
pred_df["row_id"] = list(range(len(pred_df)))
pred_df.shape

(705, 5)

In [50]:
from src.utils import get_reference_df

ref_df = get_reference_df(data)

In [51]:
eval_metrics = evaluate(pred_df, ref_df)

In [52]:
eval_metrics

{'ents_p': 0.6950354609929078,
 'ents_r': 0.98,
 'ents_f5': 0.9647860658841345,
 'ents_per_type_EMAIL_p': 0.8846153846153846,
 'ents_per_type_EMAIL_r': 1.0,
 'ents_per_type_EMAIL_f5': 0.9950083194675542,
 'ents_per_type_ID_NUM_p': 0.9666666666666667,
 'ents_per_type_ID_NUM_r': 0.9666666666666667,
 'ents_per_type_ID_NUM_f5': 0.9666666666666667,
 'ents_per_type_NAME_STUDENT_p': 0.6730769230769231,
 'ents_per_type_NAME_STUDENT_r': 0.9803921568627451,
 'ents_per_type_NAME_STUDENT_f5': 0.9634727368978295,
 'ents_per_type_PHONE_NUM_p': 0.5,
 'ents_per_type_PHONE_NUM_r': 1.0,
 'ents_per_type_PHONE_NUM_f5': 0.9629629629629629,
 'ents_per_type_STREET_ADDRESS_p': 0.8,
 'ents_per_type_STREET_ADDRESS_r': 0.9090909090909091,
 'ents_per_type_STREET_ADDRESS_f5': 0.9043478260869565,
 'ents_per_type_URL_PERSONAL_p': 0.7735849056603774,
 'ents_per_type_URL_PERSONAL_r': 1.0,
 'ents_per_type_URL_PERSONAL_f5': 0.9888682745825603,
 'ents_per_type_USERNAME_p': 0.6666666666666666,
 'ents_per_type_USERNAME_r':