In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from collections import defaultdict
from typing import Dict
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification, 
    AutoTokenizer, 
    EarlyStoppingCallback, 
    set_seed
)
from seqeval.metrics import (
    recall_score, 
    precision_score, 
    classification_report, 
    f1_score
)
from scipy.special import softmax
import torch
from tqdm import tqdm
from pathlib import Path
import wandb
import json
from IPython.core.display import display, HTML

import logging

import spacy
from spacy.tokens import Span, Doc
from spacy import displacy
from spacy.lang.en import English
import numpy as np

from utils import *
from colorama import Fore

SEQ_MAX_LENGTH = 4000

  from IPython.core.display import display, HTML


### Load dataset

In [9]:
id2label = {0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}
label2id = {v:k for k,v in id2label.items()}
O_label_id = label2id['O']


allowed_cols = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'valid']
df = pd.read_json("/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_train_custom_split.json")[allowed_cols]
df["ner_tags"] = df["labels"].apply(lambda labels_list: [label2id[x] for x in labels_list])

train_df = df[df["valid"] == False].reset_index()
valid_df = df[df["valid"] == True].reset_index()

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "valid": Dataset.from_pandas(valid_df)
})

# Word-level tokenization

Let's check that word tokenization was done with __spacy English().tokenizer__

In [10]:
for _, row in tqdm(train_df.iterrows()):
    text = row.full_text
    spacy_tokens, spacy_whitespaces = tokenize_with_spacy(text)
    assert(row.tokens == spacy_tokens)
    assert(row.trailing_whitespace == spacy_whitespaces)

5117it [00:11, 461.32it/s]


Check that full text can be recovered from words and trailing whitespaces

In [11]:
for _, row in tqdm(train_df.iterrows()):
    new_text = "".join([token + (" " if whitespace else "") for token, whitespace in zip(row.tokens, row.trailing_whitespace)])
    assert(row.full_text == new_text)

5117it [00:00, 6527.50it/s]


Original NER tags are marked according to the BIO format:

_B-_ tag corresponds to the start word of entity

_I-_ tag corresponds to the innner words of entity

_O_ tag means that word doesn't belong to any entities

Let's have a look at example below:

In [12]:
example_text = "Hellow worldd Russia!"
example_text_tokens, example_text_trailing_whitespace = tokenize_with_spacy(example_text)
example_text_ents=["O", "B-STREET_ADDRESS", "I-STREET_ADDRESS", "O"]
example_text_labels = [12, 4, 10, 12]

print(f"Text: {example_text}")
print(f"Words: {example_text_tokens}")
print(f"NER tags: {example_text_ents}")
print(f"Trailing_whitespace: {example_text_trailing_whitespace}")

html = visualize_ents(example_text_tokens, example_text_trailing_whitespace, example_text_ents)
display(HTML(html))

Text: Hellow worldd Russia!
Words: ['Hellow', 'worldd', 'Russia', '!']
NER tags: ['O', 'B-STREET_ADDRESS', 'I-STREET_ADDRESS', 'O']
Trailing_whitespace: [True, True, False, False]


# Subtoken-level tokenization

Here we further split words into subtokens and align them with original NER tags

Let's have a look at example

In [13]:
# Load tokenizer
checkpoint_path = f"/archive/savkin/models/ner/PII Data Detection/deberta-base-4000"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Expand word labels to tokens labels
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


inputs = tokenizer(example_text_tokens, is_split_into_words=True)
labels = example_text_labels
word_ids = inputs.word_ids()
subtokens = inputs.tokens()
token_labels = align_labels_with_tokens(labels, word_ids)


print("Words: ", example_text_tokens)
print("Subtokens: ", subtokens)
print("Mapping between words ans subtokens: ", word_ids)
print("\nNew NER tags: ")
for token, token_label in list(zip(subtokens, token_labels))[:20]:
    token_ner_tag = "SPECIAL TOKEN" if token_label == -100 else id2label[token_label]
    print(f"{token}\t-> {token_ner_tag} ")

Words:  ['Hellow', 'worldd', 'Russia', '!']
Subtokens:  ['[CLS]', '▁Hello', 'w', '▁world', 'd', '▁Russia', '▁!', '[SEP]']
Mapping between words ans subtokens:  [None, 0, 0, 1, 1, 2, 3, None]

New NER tags: 
[CLS]	-> SPECIAL TOKEN 
▁Hello	-> O 
w	-> O 
▁world	-> B-STREET_ADDRESS 
d	-> B-STREET_ADDRESS 
▁Russia	-> I-STREET_ADDRESS 
▁!	-> O 
[SEP]	-> SPECIAL TOKEN 


Now we can tokenize the whole dataset

In [14]:
# Tokenize dataset and align labels with tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        max_length=SEQ_MAX_LENGTH, 
        is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    subtoken2word = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
        subtoken2word.append(word_ids)

    tokenized_inputs["labels"] = new_labels
    tokenized_inputs["word_ids"] = subtoken2word
    return tokenized_inputs


tokenize_func = tokenize_and_align_labels

tokenized_dataset = dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=dataset["train"].column_names
)
tokenized_dataset["train"].features

Map:   0%|          | 0/5117 [00:00<?, ? examples/s]

Map:   0%|          | 0/1690 [00:00<?, ? examples/s]

{'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'word_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

# Model training and evaluation

Lets's start from looking at AutoModelForTokenClassification predictions

In [15]:
# Tokenize example text
example_df = pd.DataFrame.from_records([
    {
        "full_text": example_text,
        "tokens": example_text_tokens,
        "labels": example_text_ents,
        "trailing_whitespace": example_text_trailing_whitespace,
        "ner_tags": example_text_labels
    }]
)
example_dataset = Dataset.from_pandas(example_df)
tokenized_example = example_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=example_dataset.column_names
)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [16]:
# Create Trainer
model = AutoModelForTokenClassification.from_pretrained(checkpoint_path)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    args = TrainingArguments(output_dir="tmp_trainer"),
    model=model,
    data_collator=data_collator
)

In [17]:
# Get predictions
predictions = trainer.predict(tokenized_example)

true_labels = predictions.label_ids[0]
pred_labels = predictions.predictions.argmax(-1)[0]

print(Fore.GREEN + "Истинная разметка:", Fore.WHITE)
for token, true_l in list(zip(subtokens, true_labels))[:20]:
    token_ner_tag = "SPECIAL TOKEN" if true_l == -100 else id2label[true_l]
    print(f"{token}\t-> {token_ner_tag} ")

print()
print(Fore.YELLOW + "Предсказания модели", Fore.WHITE)
for token, pred_l, true_l in list(zip(subtokens, pred_labels, true_labels))[:20]:
    token_ner_tag = "SPECIAL TOKEN" if true_l == -100 else id2label[pred_l]
    print(f"{token}\t-> {token_ner_tag} ")

[32mИстинная разметка: [37m
[CLS]	-> SPECIAL TOKEN 
▁Hello	-> O 
w	-> O 
▁world	-> B-STREET_ADDRESS 
d	-> B-STREET_ADDRESS 
▁Russia	-> I-STREET_ADDRESS 
▁!	-> O 
[SEP]	-> SPECIAL TOKEN 

[33mПредсказания модели [37m
[CLS]	-> SPECIAL TOKEN 
▁Hello	-> O 
w	-> O 
▁world	-> O 
d	-> O 
▁Russia	-> O 
▁!	-> O 
[SEP]	-> SPECIAL TOKEN 


# Aggregation of subtoken-level predictions

Accumulate subtoken-level predictions into one dataframe

In [18]:
# Infer model
predictions = trainer.predict(tokenized_dataset["valid"])
true_labels = predictions.label_ids
pred_probas = softmax(predictions.predictions, axis=-1).max(-1)
pred_labels = predictions.predictions.argmax(-1)

In [20]:
submission = {
    "row_id": [],
    "document": [],
    "token": [],
    "label": [],
    "subtoken_str": [],
    "word_str": [],
    "proba": []
}

for input_ids, word_ids, row_id, document, words, p_labels, p_probas in zip(tokenized_dataset["valid"]["input_ids"], 
                                                                            tokenized_dataset["valid"]["word_ids"], 
                                                                            valid_df.index, 
                                                                            valid_df["document"], 
                                                                            valid_df["tokens"], 
                                                                            pred_labels, 
                                                                            pred_probas):
    subtokens = tokenizer.convert_ids_to_tokens(input_ids)
    for subtoken_id, (subtoken, label_id, proba) in enumerate(zip(subtokens, p_labels, p_probas)):
        word_id = word_ids[subtoken_id]
        if label_id != -100 and label_id != O_label_id and word_id is not None: # ignore O-labels
            submission["row_id"].append(row_id)
            submission["document"].append(document)
            submission["token"].append(word_id)
            submission["label"].append(id2label[label_id])
            submission["subtoken_str"].append(subtoken)
            submission["word_str"].append(words[word_id])
            submission["proba"].append(proba)

subtoken_df = pd.DataFrame().from_dict(submission).drop_duplicates().sort_values(by=["document", "token"])
subtoken_df.head()

Unnamed: 0,row_id,document,token,label,subtoken_str,word_str,proba
2528,917,7,9,B-NAME_STUDENT,▁Nathalie,Nathalie,0.990726
2529,917,7,10,I-NAME_STUDENT,▁S,Sylla,0.964352
2530,917,7,10,I-NAME_STUDENT,ylla,Sylla,0.987722
2531,917,7,482,B-NAME_STUDENT,▁Nathalie,Nathalie,0.98312
2532,917,7,483,I-NAME_STUDENT,▁S,Sylla,0.919441


Now that we have subtoken-level predictions and word mapping, we can merge subtoken NER tags into word-level tags

In [22]:
def aggregate_subtokens(df, label_agg_type = "most_frequent", add_subtoken_info = False):
    df = df.reset_index()
    row = df.iloc[0]

    if add_subtoken_info:
        row["subtokens"] = df["subtoken_str"].agg(lambda x: x.tolist())
        row["probas"] = df["proba"].agg(lambda x: x.tolist())

    if label_agg_type == "most_frequent":
        row["label"] = df.groupby(["label"])["row_id"].count().sort_values().index[-1]
        row["agg_proba"] = df[df["label"] == row["label"]]["proba"].agg("mean")
    elif label_agg_type == "first":
        row["label"] = df["label"].agg(lambda x: x[0])
        row["agg_proba"] = df["proba"].agg(lambda x: x[0])
    elif label_agg_type == "max_proba":
        row["label"] = df.iloc[df["proba"].idxmax()]["label"]
        row["agg_proba"] = df["proba"].agg("max")
    
    return row

word_df = subtoken_df.groupby(["document", "token"]) \
                     .apply(aggregate_subtokens, add_subtoken_info=True) \
                     .reset_index(drop=True) \
                     .drop(columns=["index", "subtoken_str", "proba"])
word_df.head()

Unnamed: 0,row_id,document,token,label,word_str,subtokens,probas,agg_proba
0,917,7,9,B-NAME_STUDENT,Nathalie,[▁Nathalie],[0.9907263517379761],0.990726
1,917,7,10,I-NAME_STUDENT,Sylla,"[▁S, ylla]","[0.9643515944480896, 0.9877223968505859]",0.976037
2,917,7,482,B-NAME_STUDENT,Nathalie,[▁Nathalie],[0.9831204414367676],0.98312
3,917,7,483,I-NAME_STUDENT,Sylla,"[▁S, ylla]","[0.9194409847259521, 0.9561810493469238]",0.937811
4,917,7,741,B-NAME_STUDENT,Nathalie,[▁Nathalie],[0.9903643727302551],0.990364


# Word-level Metrics

Before computing the metrics, we need to add "O" labels to our predictions

In [38]:
true_labels = valid_df["labels"].apply(lambda labels: [label2id[l] for l in labels]).tolist()

# Create a template filled with "O" label
pred_labels = valid_df["labels"].agg(lambda x: [O_label_id for _ in x]).tolist()

# Group words into documents and reorder documents according to validation dataset
original_document_order = valid_df["document"].tolist()
predicted_ents = word_df[["document", "token", "label"]].groupby("document").agg(list).reindex(original_document_order, fill_value=[])
 
# Add predictions to the template
for i, (_, row) in enumerate(predicted_ents.iterrows()):
    if len(row["token"]) > 0:
        for token_id, l in zip(row["token"], row["label"]):
            pred_labels[i][token_id] = label2id[l]

In [37]:
metric = evaluate.load("seqeval")

def f5_score(precision, recall):
    return (1 + 5*5) * recall * precision / (5*5*precision + recall + 1e-100)

def compute_metrics_from_labels(predictions, labels):
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    for label, scores in results.items():
        if "overall" not in label:
            precision = scores["precision"]
            recall = scores["recall"]
            results[label]["f5_score"] = f5_score(precision, recall)
    precision = results["overall_precision"]
    recall = results["overall_recall"]
    results["overall_f5_score"] = f5_score(precision, recall)

    return results

In [39]:
compute_metrics_from_labels(true_labels, pred_labels)

  _warn_prf(average, modifier, msg_start, len(result))


{'EMAIL': {'precision': 0.6538461538461539,
  'recall': 0.8947368421052632,
  'f1': 0.7555555555555555,
  'number': 19,
  'f5_score': 0.8822355289421158},
 'ID_NUM': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'NAME_STUDENT': {'precision': 0.9324324324324325,
  'recall': 0.7095115681233933,
  'f1': 0.8058394160583943,
  'number': 778,
  'f5_score': 0.7160961979842332},
 'PHONE_NUM': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'STREET_ADDRESS': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'URL_PERSONAL': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'USERNAME': {'precision': 0.3333333333333333,
  'recall': 0.009302325581395349,
  'f1': 0.018099547511312215,
  'number': 215,
  'f5_score': 0.009663631295298274},
 'overall_precision': 0.7758152173913043,
 'overall_recall': 0.5642292490118577,
 'overall_f1': 0.6533180778032036,
 'ov