In [1]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

## Load Dataset

In [2]:
# Read the whole dataset in one call so no file handle is left open for the
# lifetime of the kernel (the bare open() previously used was never closed).
# The bytes are handed straight to json.loads, which detects the encoding.
data = json.loads(Path("../Dataset/data.json").read_bytes())
len(data)

21672

In [3]:
# Route every row into one of two lists according to its truthy "valid" flag,
# keeping the rows in their original order.
train, valid = [], []

for row in data:
    (valid if row["valid"] else train).append(row)

print("Samples in training data:", len(train))
print("Samples in validation data:", len(valid))

Samples in training data: 20322
Samples in validation data: 1350


In [4]:
# Every distinct tag found in the data, sorted so that the integer ids are
# stable from run to run.
all_labels = sorted(set(chain.from_iterable(x["labels"] for x in data)))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

# The non-"O" tags. Not referenced by any other cell visible in this notebook.
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

# `label_id` rather than `id`: a loop variable at notebook scope stays alive
# after the cell finishes and would shadow the builtin id() for later cells.
for label_id, label in id2label.items():
    print(label_id, label)

0 B-EMAIL
1 B-ID_NUM
2 B-NAME_STUDENT
3 B-PHONE_NUM
4 B-STREET_ADDRESS
5 B-URL_PERSONAL
6 B-USERNAME
7 I-ID_NUM
8 I-NAME_STUDENT
9 I-PHONE_NUM
10 I-STREET_ADDRESS
11 I-URL_PERSONAL
12 O


## Tokenize

In [5]:
def rebuild_from_example(example):
    """Reassemble the document text and build per-character lookups.

    Args:
        example: mapping with parallel sequences under "tokens", "labels"
            and "trailing_whitespace".

    Returns:
        A ``(text, labels, token_map)`` tuple where
        * ``text`` is the tokens joined together, with a single space after
          each token whose trailing-whitespace flag is truthy;
        * ``labels`` is a NumPy array holding one label per character of
          ``text`` (inserted spaces are labelled "O");
        * ``token_map`` is a list holding, per character, the index of the
          token it came from (inserted spaces map to -1).
    """
    pieces, char_labels, char_to_token = [], [], []

    fields = zip(
        example["tokens"], example["labels"], example["trailing_whitespace"]
    )
    for position, (tok, tag, has_space) in enumerate(fields):
        width = len(tok)
        pieces.append(tok)
        char_to_token += [position] * width
        char_labels += [tag] * width

        if not has_space:
            continue

        # The separating space belongs to no token and carries no entity.
        pieces.append(" ")
        char_labels.append("O")
        char_to_token.append(-1)

    return "".join(pieces), np.array(char_labels), char_to_token

In [6]:
def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one example and project its character labels onto the tokens.

    The text is rebuilt with ``rebuild_from_example`` and passed to
    ``tokenizer`` with offsets requested and truncation at ``max_length``.
    Each produced token is labelled with the label of the character at its
    start offset.

    Args:
        example: row with "tokens", "labels" and "trailing_whitespace".
        tokenizer: callable returning an encoding that exposes
            ``offset_mapping`` and ``input_ids`` and can be unpacked with
            ``**``.
        label2id: mapping from label string to integer id; must contain "O".
        max_length: truncation length forwarded to the tokenizer.

    Returns:
        dict with every field of the encoding plus "labels" (one id per
        token), "length" (number of input ids) and "token_map" (the
        per-character token index list from ``rebuild_from_example``).
    """
    text, labels, token_map = rebuild_from_example(example)

    # actual tokenization
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
    )

    text_length = len(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # A (0, 0) span is treated as a special token (e.g. CLS) and gets "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # When the token starts on whitespace, use the label of the following
        # character. The bounds check keeps a whitespace-only token at the
        # very end of the text from indexing one past the end of `labels`.
        if text[start_idx].isspace() and start_idx + 1 < text_length:
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
        "token_map": token_map,
    }

In [7]:
def create_dict(data):
    """Turn a sequence of row dicts into a dict of column lists.

    Only the six fields listed in ``keys`` are kept; any other field on a row
    is ignored. Every column has one entry per row, in row order. A row that
    lacks one of the fields raises ``KeyError``.
    """
    keys = ("full_text", "document", "tokens", "trailing_whitespace", "labels", "token_indices")

    columns = {key: [] for key in keys}

    # Row-major fill: each row contributes one value to every column.
    for row in data:
        for key, column in columns.items():
            column.append(row[key])

    return columns

In [8]:
# Not referenced by any other cell visible in this notebook.
# NOTE(review): presumably a leftover from a separate inference script — confirm before removing.
INFERENCE_MAX_LENGTH = 2048
# Truncation length handed to tokenize() through fn_kwargs when valid_ds is mapped.
max_length = 1024
# Directory that both the tokenizer and the token-classification model are loaded from.
model_path = '../model_dir/deberta3base_1024'

In [9]:
# The tokenizer comes from the same path the model is later loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Tokenization of the training split is commented out; only the validation
# split is tokenized in this notebook.
# train_ds = Dataset.from_dict(create_dict(train))
# train_ds = train_ds.map(tokenize, 
#                         fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": max_length}, 
#                         num_proc=3)

# Build the validation Dataset column-wise, then apply tokenize() to every
# row with three worker processes.
valid_ds = Dataset.from_dict(create_dict(valid))
valid_ds = valid_ds.map(tokenize, 
                        fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": max_length}, 
                        num_proc=3)

Map (num_proc=3):   0%|          | 0/1350 [00:00<?, ? examples/s]

## Compute Metrics

In [10]:
from typing import Dict

class PRFScore:
    """Running tally of true positives, false positives and false negatives.

    Precision, recall, F1 and F5 are derived from the three counters on
    demand. A tiny epsilon is added to every denominator, so a tally with no
    observations scores 0.0 instead of dividing by zero.
    """

    # Denominator guard shared by all derived scores.
    _EPS = 1e-100

    def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        """Total number of counted events (tp + fp + fn)."""
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):
        """Fold another tally into this one and return ``self``."""
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        """Return a new tally holding the element-wise sum of both operands."""
        return PRFScore(
            tp=self.tp + other.tp,
            fp=self.fp + other.fp,
            fn=self.fn + other.fn,
        )

    def score_set(self, cand: set, gold: set) -> None:
        """Update the counters from a candidate set and a gold set."""
        self.tp += len(cand.intersection(gold))  # in both
        self.fp += len(cand - gold)  # predicted but not gold
        self.fn += len(gold - cand)  # gold but not predicted

    @classmethod
    def _ratio(cls, numerator: float, denominator: float) -> float:
        """Divide with the epsilon guard applied to the denominator."""
        return numerator / (denominator + cls._EPS)

    @property
    def precision(self) -> float:
        return self._ratio(self.tp, self.tp + self.fp)

    @property
    def recall(self) -> float:
        return self._ratio(self.tp, self.tp + self.fn)

    @property
    def f1(self) -> float:
        prec, rec = self.precision, self.recall
        return 2 * self._ratio(prec * rec, prec + rec)

    @property
    def f5(self) -> float:
        # F-beta with beta = 5, i.e. recall weighted far above precision.
        beta_sq = 5**2
        prec, rec = self.precision, self.recall
        return self._ratio((1 + beta_sq) * prec * rec, beta_sq * prec + rec)

    def to_dict(self) -> Dict[str, float]:
        """Return precision, recall and F5 under the keys "p", "r", "f5"."""
        return {"p": self.precision, "r": self.recall, "f5": self.f5}

In [11]:
def parse_predictions(predictions, id2label, ds, threshold=0.9):
    """Convert token-level logits into one predicted label per original token.

    A position keeps its arg-max class unless the softmax probability of "O"
    is below ``threshold``; in that case the most probable non-"O" class is
    used instead. Each sub-word prediction is then mapped back to the
    original token through the character offsets, and only the first
    non-"O" prediction for a given (document, token) pair is kept.

    Args:
        predictions: float array of logits, indexed as
            (example, position, class).
        id2label: mapping from integer class id to label string. The "O"
            class is looked up here; if it is absent the last class column is
            assumed to be "O".
        ds: dataset (or dict of lists) providing the columns "token_map",
            "offset_mapping", "tokens", "document" and "token_indices".
        threshold: minimum "O" probability required to keep an "O" decision.

    Returns:
        DataFrame with columns "eval_row", "document", "token", "label",
        "token_str" and a sequential "row_id".
    """
    # Subtracting the per-position maximum leaves the softmax unchanged but
    # keeps exp() from overflowing; an overflow would turn the probabilities
    # into NaN and silently disable the threshold rule below.
    shifted = predictions - predictions.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    pred_softmax = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

    # Find the "O" column from the mapping rather than assuming index 12.
    o_id = next(
        (idx for idx, name in id2label.items() if name == "O"),
        predictions.shape[-1] - 1,
    )

    preds = predictions.argmax(-1)
    # Probabilities are never negative, so -1 removes "O" from the arg-max.
    non_o_probs = pred_softmax.copy()
    non_o_probs[..., o_id] = -1.0
    preds_without_O = non_o_probs.argmax(-1)
    O_preds = pred_softmax[..., o_id]
    preds_final = np.where(O_preds < threshold, preds_without_O, preds)

    seen_pairs = set()  # set membership keeps the de-duplication linear
    row, document, token, label, token_str = [], [], [], [], []
    for i, (p, token_map, offsets, tokens, doc, indices) in enumerate(
        zip(
            preds_final,
            ds["token_map"],
            ds["offset_mapping"],
            ds["tokens"],
            ds["document"],
            ds["token_indices"],
        )
    ):
        n_chars = len(token_map)

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[token_pred]

            # (0, 0) offsets carry no text; skip them.
            if start_idx + end_idx == 0:
                continue

            # Advance past separator spaces (token_map == -1) and past
            # whitespace-only tokens such as "\n\n". The -1 test comes first
            # so that -1 is never used as an index, which would silently
            # select the last token of the document.
            while start_idx < n_chars and (
                token_map[start_idx] == -1
                or tokens[token_map[start_idx]].isspace()
            ):
                start_idx += 1

            if start_idx >= n_chars:
                break

            original_token_id = token_map[start_idx]
            token_id = indices[original_token_id]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                pair = (doc, token_id)

                if pair not in seen_pairs:
                    row.append(i)
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[original_token_id])
                    seen_pairs.add(pair)

    df = pd.DataFrame(
        {
            "eval_row": row,
            "document": document,
            "token": token,
            "label": label,
            "token_str": token_str,
        }
    )

    df = df.drop_duplicates().reset_index(drop=True)

    df["row_id"] = list(range(len(df)))
    return df

In [12]:
from collections import defaultdict

def compute_metrics(p, id2label, valid_ds, valid_df, threshold=0.9):
    """Score thresholded predictions against the reference labels.

    Predictions are decoded with ``parse_predictions`` and compared with
    ``valid_df`` as sets of (document, token, label) triples. Counts are kept
    per entity type (the label with its two-character "B-"/"I-" prefix
    removed) and summed for the overall scores.

    Args:
        p: 2-tuple ``(predictions, labels)``; only ``predictions`` is used.
        id2label: mapping from class id to label string.
        valid_ds: tokenized dataset passed through to ``parse_predictions``.
        valid_df: reference frame with "document", "token" and "label"
            columns.
        threshold: "O" probability threshold forwarded to
            ``parse_predictions``.

    Returns:
        Flat dict with "ents_p", "ents_r", "ents_f5" and one
        "ents_per_type_<TYPE>_<p|r|f5>" entry per entity type.
    """
    predictions, _ = p  # label ids are unused; references come from valid_df

    pred_df = parse_predictions(predictions, id2label, valid_ds, threshold=threshold)

    # TODO (from the original author): change these triples to pairs.
    references = {
        (row.document, row.token, row.label) for row in valid_df.itertuples()
    }
    predicted = {
        (row.document, row.token, row.label) for row in pred_df.itertuples()
    }

    # defaultdict creates a zeroed PRFScore the first time a type is seen.
    score_per_type = defaultdict(PRFScore)

    for ex in predicted:
        pred_type = ex[-1]  # (document, token, label)

        if pred_type != "O":
            pred_type = pred_type[2:]  # Discard B- and I- prefix

        if ex in references:
            score_per_type[pred_type].tp += 1
            references.remove(ex)
        else:
            score_per_type[pred_type].fp += 1

    # Whatever is still in `references` was never predicted. Each miss is
    # charged to the type of the reference itself, not to the type left over
    # from the last iteration of the loop above.
    for _, _, ref_type in references:
        if ref_type != "O":
            ref_type = ref_type[2:]  # Discard B- and I- prefix

        score_per_type[ref_type].fn += 1

    totals = PRFScore()

    for prf in score_per_type.values():
        totals += prf

    results = {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
        "ents_per_type": {
            k: v.to_dict() for k, v in score_per_type.items() if k != "O"
        },
    }

    # Flatten up to two levels of nesting into underscore-joined keys.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                if isinstance(v, dict):
                    for n2, v2 in v.items():
                        final_results[f"{key}_{n}_{n2}"] = v2
                else:
                    final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value

    return final_results

In [13]:
def get_reference_df(valid):
    """Build the gold-label table used for scoring.

    Each row of ``valid`` is expanded to one row per token, the token column
    is replaced by the token's running position within its document, and
    tokens labelled "O" are dropped.

    Returns:
        DataFrame with columns "row_id" (row position in the expanded table,
        before "O" rows were removed), "document", "token" and "label".
    """
    per_token = (
        pd.DataFrame(valid)[["document", "tokens", "labels"]]
        .explode(["tokens", "labels"])
        .reset_index(drop=True)
        .rename(columns={"tokens": "token", "labels": "label"})
        # Overwrite the token text with its running position in the document.
        .assign(token=lambda frame: frame.groupby("document").cumcount())
    )

    entities = per_token.loc[per_token["label"] != "O"]
    entities = entities.reset_index().rename(columns={"index": "row_id"})

    return entities[["row_id", "document", "token", "label"]].copy()


# Gold (document, token, label) rows for the validation split, with "O" tokens removed.
reference_df = get_reference_df(valid)

## Model Inference

In [14]:
# Load the token-classification model from the same path as the tokenizer.
model = AutoModelForTokenClassification.from_pretrained(model_path)
# Collator for token classification; per its argument name it pads each
# batch to a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
# The Trainer is used only for prediction in this notebook: output directory
# is the current directory, one example per device per eval step, and no
# reporting integration.
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=1, 
    report_to="none",
)
# NOTE(review): newer transformers releases reportedly rename `tokenizer` to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

In [15]:
# Run the model over the tokenized validation split; the logits are read
# from preds.predictions in the threshold sweep.
# NOTE(review): the saved progress bar below shows a total of 20322, which is
# the training-set size printed earlier, not the 1350 validation rows. The
# saved output looks stale — re-run from a fresh kernel to confirm.
preds = trainer.predict(valid_ds)

  0%|          | 0/20322 [00:00<?, ?it/s]

In [20]:
print("Computing final metrics...")
# Re-score the same logits once per candidate "O" threshold and keep only the
# overall F5. The second tuple element is None because compute_metrics does
# not use the label ids; references come from reference_df.
final_metrics = {
    f"final_f5_at_{threshold}": compute_metrics(
        (preds.predictions, None),
        id2label,
        valid_ds,
        reference_df,
        threshold=threshold,
    )["ents_f5"]
    for threshold in [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97]
}
print(final_metrics)

Computing final metrics...
{'final_f5_at_0.5': 0.8332093840849165, 'final_f5_at_0.6': 0.8329960879535949, 'final_f5_at_0.7': 0.8323917137476459, 'final_f5_at_0.8': 0.8295652173913043, 'final_f5_at_0.9': 0.8256430654998673, 'final_f5_at_0.95': 0.8159067085953878, 'final_f5_at_0.97': 0.8050420168067227}


In [21]:
# Keys look like "final_f5_at_0.5": take the key with the highest F5 and parse
# the threshold back out of its last "_"-separated piece. On ties max() keeps
# the first key, i.e. the lowest threshold.
# NOTE(review): in the saved output F5 falls steadily as the threshold rises
# and the winner is the smallest value tried (0.5) — consider sweeping lower
# thresholds as well.
best_threshold = float(max(final_metrics, key=final_metrics.get).split("_")[-1])
print(best_threshold)

0.5
