In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from collections import defaultdict
from typing import Dict
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification, 
    AutoTokenizer, 
    EarlyStoppingCallback, 
    set_seed
)
from seqeval.metrics import (
    recall_score, 
    precision_score, 
    classification_report, 
    f1_score
)
from scipy.special import softmax
import torch
from pathlib import Path
import wandb
import json
from tqdm import tqdm
import logging

from utils import visualize_ents, apply_threshold

# Seed handed to transformers' `set_seed` so that repeated runs of the notebook are reproducible.
random_seed = 42
set_seed(random_seed)

# Passed as `max_length` to the tokenizer (with truncation=True): longer inputs are cut off.
INFERENCE_MAX_LENGTH = 4000

## Load dataset

In [2]:
# Read the test documents and define the mapping between class indices and BIO tags.

test_df = pd.read_json(
    "/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_test.json"
)

# Class index -> tag name.
id2label = {
    0: 'B-EMAIL',
    1: 'B-ID_NUM',
    2: 'B-NAME_STUDENT',
    3: 'B-PHONE_NUM',
    4: 'B-STREET_ADDRESS',
    5: 'B-URL_PERSONAL',
    6: 'B-USERNAME',
    7: 'I-ID_NUM',
    8: 'I-NAME_STUDENT',
    9: 'I-PHONE_NUM',
    10: 'I-STREET_ADDRESS',
    11: 'I-URL_PERSONAL',
    12: 'O',
}
# Reverse lookup: tag name -> class index.
label2id = dict(zip(id2label.values(), id2label.keys()))
# Index of the 'O' tag, used later to filter out non-entity predictions.
O_label_id = label2id['O']

# Wrap the frame in a `datasets.Dataset` for the tokenization / prediction steps.
dataset = Dataset.from_pandas(test_df)

## Tokenization pipeline

In [3]:
# Checkpoint directory; the tokenizer is loaded from it.
model_checkpoint = "/archive/savkin/models/ner/PII Data Detection/deberta-base-4000"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# Tokenize pre-split documents and keep the sub-token -> word mapping
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split documents and attach per-sub-token word indices.

    Despite the name, no labels are read or aligned here: the function only
    tokenizes ``examples["tokens"]`` (truncating at ``INFERENCE_MAX_LENGTH``) and
    stores, for every example in the batch, the result of ``word_ids(i)`` under the
    ``"word_ids"`` key of the returned encoding.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    # One word-id list per example in the batch.
    encoded["word_ids"] = [
        encoded.word_ids(batch_idx) for batch_idx in range(len(examples["tokens"]))
    ]
    return encoded

# Tokenize the whole dataset in batches, keeping only the tokenizer outputs.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    remove_columns=dataset.column_names,
    batched=True,
)

# Sanity check: every sub-token must have a word-id entry.
assert all(
    len(example["word_ids"]) == len(example["input_ids"])
    for example in tokenized_dataset
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

# Inference

In [4]:

# Load the token-classification model and its data collator, then wrap both in a
# Trainer (reporting disabled) that the next cell uses for prediction.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=TrainingArguments(report_to="none", output_dir="tmp_trainer"),
)

In [5]:
# Run the model over the tokenized test set and derive labels / confidences.

# Threshold handed to `apply_threshold`; hard-coded here rather than read from
# model.config.best_threshold.
best_threshold = 0.9

predictions = trainer.predict(tokenized_dataset)
# Highest softmax probability over the class axis for every position.
pred_probas = np.max(softmax(predictions.predictions, axis=-1), axis=-1)
# Label ids from the project helper (defined in utils; semantics not shown here).
pred_labels = apply_threshold(predictions.predictions, best_threshold, O_label_id)

## Convert to competition format 

In [7]:
# Column-wise accumulator: one entry per sub-token predicted as an entity.
submission = {
    "row_id": [],
    "document": [],
    "token": [],
    "label": [],
    "proba": []
}

for word_ids, row_id, document, p_labels, p_probas in zip(
    tokenized_dataset["word_ids"],
    test_df.index,
    test_df["document"],
    pred_labels,
    pred_probas,
):
    # zip stops at len(word_ids), so prediction entries past the real sequence are ignored.
    for word_id, label_id, proba in zip(word_ids, p_labels, p_probas):
        # Skip positions without a word, ignored positions (-100) and 'O' predictions.
        if word_id is None or label_id == -100 or label_id == O_label_id:
            continue
        submission["row_id"].append(row_id)
        submission["document"].append(document)
        submission["token"].append(word_id)
        submission["label"].append(id2label[label_id])
        submission["proba"].append(proba)


df = (
    pd.DataFrame.from_dict(submission)
    .drop_duplicates()
    .sort_values(by=["document", "token"])
)
submission_df_subtoken_level = df.loc[df["label"] != "O"].copy(deep=True)
submission_df_subtoken_level.head()

Unnamed: 0,row_id,document,token,label,proba
0,0,7,9,B-NAME_STUDENT,0.990726
1,0,7,10,I-NAME_STUDENT,0.964352
2,0,7,10,I-NAME_STUDENT,0.987722
3,0,7,482,B-NAME_STUDENT,0.98312
4,0,7,483,I-NAME_STUDENT,0.919441


In [8]:
def aggregate_subtokens(df, label_agg_type = "most_frequent", add_subtoken_info = False):
    """Collapse the sub-token predictions of one word into a single row.

    Intended for ``groupby(["document", "token"]).apply(...)``: ``df`` holds all
    sub-token rows that belong to the same word.

    Args:
        df: DataFrame with at least ``label`` and ``proba`` columns (and
            ``subtoken_str`` when ``add_subtoken_info`` is true).
        label_agg_type: strategy used to pick the word-level label:
            ``"most_frequent"`` - the label occurring most often; ``agg_proba`` is
            the mean probability of the rows carrying that label (ties resolve to
            the alphabetically last label);
            ``"first"`` - label and probability of the first sub-token;
            ``"max_proba"`` - label and probability of the most confident sub-token.
        add_subtoken_info: when true, also store the per-sub-token strings and
            probabilities as lists under ``subtokens`` and ``probas``.

    Returns:
        pandas.Series: the first row of the group (including the ``index`` column
        produced by ``reset_index``) with ``label`` overwritten and ``agg_proba``
        added.

    Raises:
        ValueError: if ``label_agg_type`` is not a supported strategy.
    """
    supported = ("most_frequent", "first", "max_proba")
    if label_agg_type not in supported:
        # Fail loudly instead of returning a row without `agg_proba`.
        raise ValueError(
            f"Unknown label_agg_type {label_agg_type!r}; expected one of {supported}"
        )

    df = df.reset_index()
    # Copy so the assignments below never write back into (a view of) `df`.
    row = df.iloc[0].copy()

    if add_subtoken_info:
        row["subtokens"] = df["subtoken_str"].tolist()
        row["probas"] = df["proba"].tolist()

    if label_agg_type == "most_frequent":
        # Stable sort keeps the alphabetical group order among equal counts, which
        # makes tie-breaking deterministic.
        label_counts = df.groupby("label").size().sort_values(kind="stable")
        row["label"] = label_counts.index[-1]
        row["agg_proba"] = df.loc[df["label"] == row["label"], "proba"].mean()
    elif label_agg_type == "first":
        # Positional access: `Series.agg(lambda x: x[0])` is applied element-wise to
        # strings and would return each label's first character.
        row["label"] = df["label"].iloc[0]
        row["agg_proba"] = df["proba"].iloc[0]
    else:  # "max_proba"
        best = df["proba"].idxmax()
        row["label"] = df.loc[best, "label"]
        row["agg_proba"] = df.loc[best, "proba"]

    return row

# Reduce the sub-token rows to one row per (document, token) pair and drop the
# helper columns that are not part of the submission.
submission_df = (
    submission_df_subtoken_level
    .groupby(["document", "token"])
    .apply(aggregate_subtokens, label_agg_type="most_frequent")
    .reset_index(drop=True)
    .drop(columns=["index", "proba", "agg_proba"])
)

submission_df.head()

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,0,7,10,I-NAME_STUDENT
2,0,7,482,B-NAME_STUDENT
3,0,7,483,I-NAME_STUDENT
4,0,7,741,B-NAME_STUDENT


In [9]:
# Write the four submission columns to disk without the DataFrame index.
submission_df.to_csv(
    "submission.csv",
    columns=["row_id", "document", "token", "label"],
    index=False,
)