In [1]:
import os
# Weights & Biases / CUDA settings, exported before `wandb`, `transformers` and `torch`
# are imported in the next cell.
os.environ["WANDB_PROJECT"] = "PII Data Detection"
os.environ["WANDB_LOG_MODEL"] = "true"
os.environ["WANDB_WATCH"] = "all"
# NOTE(review): an empty CUDA_VISIBLE_DEVICES hides every GPU from this process, yet later
# cells call `model.to('cuda')` — confirm the intended device id before a full re-run.
os.environ["CUDA_VISIBLE_DEVICES"]=""
# presumably makes CUDA device ids follow PCI bus order — verify against the CUDA docs
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"

In [2]:
from collections import defaultdict
from typing import Dict
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification, 
    AutoTokenizer, 
    EarlyStoppingCallback, 
    set_seed
)
from seqeval.metrics import (
    recall_score, 
    precision_score, 
    classification_report, 
    f1_score
)
from scipy.special import softmax
import torch
from pathlib import Path
import wandb
import json
from tqdm import tqdm
import logging

from utils import visualize_ents, apply_threshold

# Seed passed to transformers' `set_seed` for reproducibility.
random_seed = 42
set_seed(random_seed)

# Used as the tokenizer's `max_length` (with truncation) in `tokenize_and_align_labels`,
# and embedded in the run name below.
INFERENCE_MAX_LENGTH = 4000
# Name of the W&B run; also the last path component of the model save directory.
wandb_run_name = f"deberta-base-{INFERENCE_MAX_LENGTH}-crf"
# wandb_run_name = "deleteme"
# NOTE(review): absolute, user-specific path — consider a configurable base directory.
model_save_path = f"/archive/savkin/models/ner/PII Data Detection/{wandb_run_name}"

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [3]:
# Load dataset and convert ner_tags to labels
# Only these columns of the JSON file are kept.
allowed_cols = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'valid']

# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df = pd.read_json("/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_train_custom_split.json")[allowed_cols]

# Label vocabulary. Ids are NOT interleaved B/I pairs, and B-EMAIL / B-USERNAME have no I- counterpart.
id2label = {0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}
label2id = {v:k for k,v in id2label.items()}
O_label_id = label2id['O']


# Word-level label ids, one list per document.
df["ner_tags"] = df["labels"].apply(lambda labels_list: [label2id[x] for x in labels_list])
# True when a document contains more than one distinct label string.
df["has_ents"] = df['labels'].apply(lambda labels: len(set(labels)) > 1)

# Split on the precomputed `valid` flag. `reset_index()` without `drop=True` keeps the old
# index as an extra "index" column in both frames.
train_df = df[df["valid"] == False].reset_index()
valid_df = df[df["valid"] == True].reset_index()

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "valid": Dataset.from_pandas(valid_df)
})

# Tokenize data

In [4]:
# Load model and tokenizer
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [5]:
# Expand word labels to tokens labels
def align_labels_with_tokens(labels, word_ids, id2label_map=None):
    """Expand word-level label ids to one label id per sub-token.

    Args:
        labels: Word-level label ids of one document, indexed by word position.
        word_ids: For every sub-token, the index of the word it belongs to, or
            ``None`` for special tokens.
        id2label_map: Optional ``{label_id: label_name}`` mapping used to find the
            ``I-`` counterpart of each ``B-`` label. Defaults to the notebook-level
            ``id2label``.

    Returns:
        A list with one entry per sub-token: ``-100`` for special tokens, the word's
        own label for its first sub-token, and for following sub-tokens of the same
        word the matching ``I-XXX`` id when the word label is ``B-XXX``. A ``B-`` label
        without an ``I-`` counterpart, and every non-``B-`` label, is repeated unchanged.
    """
    if id2label_map is None:
        id2label_map = id2label

    # Resolve B-XXX -> I-XXX through the label names. The ids in this notebook are not
    # laid out as adjacent (B, I) pairs, so arithmetic on the id would pick wrong classes.
    name_to_id = {name: label_id for label_id, name in id2label_map.items()}
    continuation_label = {
        label_id: name_to_id.get("I-" + name[2:], label_id)
        for label_id, name in id2label_map.items()
        if name.startswith("B-")
    }

    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            new_labels.append(-100)
        elif word_id != current_word:
            # First sub-token of a new word keeps the word's label
            new_labels.append(labels[word_id])
        else:
            # Further sub-token of the same word
            label = labels[word_id]
            new_labels.append(continuation_label.get(label, label))
        current_word = word_id

    return new_labels

# Tokenize dataset and align labels with tokens
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split documents and attach sub-token level labels.

    Args:
        examples: Batch with a "tokens" entry (words per document) and a "ner_tags"
            entry (word-level label ids per document).

    Returns:
        The tokenizer output for the batch, extended with "labels" (label ids aligned
        to sub-tokens) and "word_ids" (sub-token -> word index per document).
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    word_tags = examples["ner_tags"]
    # One word-id list per document in the batch.
    per_document_word_ids = [encoded.word_ids(doc_idx) for doc_idx in range(len(word_tags))]

    encoded["labels"] = [
        align_labels_with_tokens(tags, doc_word_ids)
        for tags, doc_word_ids in zip(word_tags, per_document_word_ids)
    ]
    encoded["word_ids"] = per_document_word_ids
    return encoded

# Tokenize every split in batches. The original columns (taken from the train split) are
# removed, so only what `tokenize_and_align_labels` returns remains.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)
# Show the resulting feature schema.
tokenized_dataset["train"].features

Map: 100%|██████████| 5117/5117 [00:10<00:00, 499.13 examples/s]
Map: 100%|██████████| 1690/1690 [00:03<00:00, 472.72 examples/s]


{'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'word_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

# Train model

In [6]:
seqeval_metrics = evaluate.load("seqeval")

def f5_score(precision, recall):
    """Return the F-beta score with beta = 5 for the given precision and recall.

    A tiny constant is added to the denominator so that precision = recall = 0
    yields 0.0 and does not divide by zero.
    """
    beta_squared = 5 * 5
    numerator = (1 + beta_squared) * recall * precision
    denominator = beta_squared * precision + recall + 1e-100
    return numerator / denominator

def compute_metrics_from_labels(predictions, labels):
    """Compute seqeval metrics plus F5 scores from predicted and gold label ids.

    Args:
        predictions: Per-sequence iterables of predicted label ids.
        labels: Per-sequence iterables of gold label ids; positions equal to -100
            are ignored in both the gold and the predicted sequence.

    Returns:
        The dict produced by the seqeval metric, where every per-entity entry gets an
        additional "f5_score" and an "overall_f5_score" key is added.

    Raises:
        KeyError: If a kept position holds an id that is not a key of ``id2label``.
    """
    # Ids are cast to plain ints before the lookup: torch scalars hash by identity, so
    # `id2label[tensor(12)]` raises KeyError even though 12 is a valid id.
    true_labels = [
        [id2label[int(l)] for l in label if int(l) != -100]
        for label in labels
    ]
    true_predictions = [
        [id2label[int(p)] for (p, l) in zip(prediction, label) if int(l) != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval_metrics.compute(predictions=true_predictions, references=true_labels)

    # Per-entity entries are dicts; the "overall_*" entries are plain numbers.
    for label, scores in results.items():
        if "overall" not in label:
            scores["f5_score"] = f5_score(scores["precision"], scores["recall"])
    results["overall_f5_score"] = f5_score(
        results["overall_precision"], results["overall_recall"]
    )

    return results


def compute_metrics(eval_preds):
    """Metric callback for per-token logits.

    Unpacks ``(logits, labels)``, takes the arg-max class over the last axis and
    delegates to ``compute_metrics_from_labels``.
    """
    logits, labels = eval_preds
    return compute_metrics_from_labels(np.argmax(logits, axis=-1), labels)

In [7]:
def compute_metrics_crf(eval_preds):
    """Metric callback for predictions that are already label ids.

    Unpacks ``(tags, labels)`` and forwards both unchanged to
    ``compute_metrics_from_labels`` (no arg-max step).
    """
    decoded_tags, gold_labels = eval_preds
    return compute_metrics_from_labels(decoded_tags, gold_labels)

In [8]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=16)

In [9]:
import sys
sys.path.append('..')
from src.deberta_crf import DebertaV2WithCRF
from src.deberta_lstm_crf import DebertaV2WithLSTMCRF

In [10]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [11]:
model = DebertaV2WithCRF.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    # output_hidden_states=True,
)

Some weights of DebertaV2WithCRF were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'crf.end_transitions', 'crf.start_transitions', 'crf.transitions']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.config.pruned_heads

{}

In [None]:
# Inspect the full configuration of the loaded model.
model.config

In [12]:
# Training configuration: evaluate, save and log every 200 steps; keep one checkpoint;
# at the end reload the checkpoint with the highest `overall_f5_score`.
training_args = TrainingArguments(
    output_dir="training_logs",
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=5,
    # num_train_epochs=1,
    # max_steps=400,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    warmup_steps=600,
    eval_steps=200,
    save_steps=200,
    logging_steps=200,
    save_total_limit=1,
    # Key produced by `compute_metrics_from_labels`.
    metric_for_best_model="overall_f5_score",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="wandb",
    # report_to="none",
    run_name=wandb_run_name
)


# NOTE(review): `compute_metrics` applies an arg-max over the model's predictions, while
# `compute_metrics_crf` (defined above) takes already-decoded tags. The model here is
# `DebertaV2WithCRF` — confirm which callback matches what its forward pass returns.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(4)]
)

In [13]:
train_loader = trainer.get_train_dataloader()

In [14]:
# trainer.train()
# wandb.finish()

In [15]:
for batch in train_loader:
    break

In [16]:
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [28]:
model.to('cuda')
print()




In [17]:
batch['labels']

tensor([[-100,   12,   12,  ..., -100, -100, -100],
        [-100,   12,   12,  ..., -100, -100, -100],
        [-100,   12,   12,  ..., -100, -100, -100]], device='cuda:0')

In [22]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f9d4ad6d300>

In [23]:
out = model(**batch)

In [24]:
out.loss

tensor(3.0366, device='cuda:0', grad_fn=<NegBackward0>)

In [25]:
out.loss.backward()

  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
    self._run_once()
  File "/home

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.LongTensor [3]] is at version 4; expected version 3 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [37]:
compute_metrics_crf((predictions, gt_labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'NAME_STUDENT': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'PHONE_NUM': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'URL_PERSONAL': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.004709576138147566,
 'overall_f5_score': 0.0}

In [62]:
true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
true_predictions = [
    [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

KeyError: tensor(12, device='cuda:0')

In [63]:
compute_metrics_from_labels(predictions, gt_labels)

  _warn_prf(average, modifier, msg_start, len(result))


{'STREET_ADDRESS': {'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0,
  'number': 0,
  'f5_score': 0.0},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.5007849293563579,
 'overall_f5_score': 0.0}

In [36]:
compute_metrics_from_labels(tags.detach().cpu().numpy(), labels.detach().cpu().numpy())

KeyError: -100

In [34]:
_, seq_length = attention_mask.shape
padded_tags = [tag + [-100] * (seq_length - len(tag)) for tag in tags]
tags = torch.tensor(padded_tags, dtype=torch.long, device=logits.device)

In [32]:
logits = logits.transpose(0, 1)
labels = labels.transpose(0, 1)
attention_mask = attention_mask.transpose(0, 1)

In [56]:
model.to('cuda')
print()




In [65]:
model.crf.reset_parameters()

In [72]:
numerator = model.crf._compute_score(logits, labels, attention_mask)

In [73]:
numerator

tensor([-123.5187, -248.7553, -219.1508], device='cuda:0',
       grad_fn=<AddBackward0>)

In [78]:
numerator - denominator

tensor([ -935.3416, -1499.7466, -1464.5309], device='cuda:0',
       grad_fn=<SubBackward0>)

In [79]:
nn.Parameter()

NameError: name 'nn' is not defined

In [69]:
model.crf.start_transitions

Parameter containing:
tensor([ 0.0226, -0.0980, -0.0203, -0.0919, -0.0687, -0.0035,  0.0472, -0.0188,
         0.0038, -0.0427, -0.0517,  0.0846,  0.0660], device='cuda:0',
       requires_grad=True)

In [77]:
model.crf.transitions

Parameter containing:
tensor([[ 0.0652, -0.0574,  0.0458, -0.0358,  0.0510, -0.0574,  0.0856, -0.0712,
         -0.0306,  0.0995,  0.0106, -0.0502,  0.0860],
        [-0.0442,  0.0685,  0.0043, -0.0723,  0.0391,  0.0184,  0.0741, -0.0183,
          0.0558, -0.0248, -0.0666, -0.0331, -0.0162],
        [-0.0773,  0.0978,  0.0267,  0.0265,  0.0163,  0.0545, -0.0925, -0.0921,
         -0.0716,  0.0433,  0.0006,  0.0923,  0.0432],
        [ 0.0513,  0.0911,  0.0954, -0.0634, -0.0694,  0.0553,  0.0584, -0.0790,
          0.0680,  0.0627,  0.0509,  0.0822,  0.0543],
        [-0.0357, -0.0891,  0.0008,  0.0276, -0.0801, -0.0591,  0.0387,  0.0609,
          0.0905,  0.0626,  0.0740, -0.0273, -0.0541],
        [-0.0288, -0.0100, -0.0803, -0.0369,  0.0501,  0.0407, -0.0342,  0.0828,
         -0.0219, -0.0649,  0.0956, -0.0256, -0.0486],
        [ 0.0837,  0.0973,  0.0968,  0.0473, -0.0074,  0.0996, -0.0877,  0.0808,
         -0.0656, -0.0989,  0.0641, -0.0122, -0.0941],
        [-0.0482, -0.0323,

In [71]:
model.crf.end_transitions

Parameter containing:
tensor([ 0.0975, -0.0742,  0.0124,  0.0044,  0.0489,  0.0191,  0.0929,  0.0796,
         0.0546,  0.0336,  0.0092,  0.0014,  0.0176], device='cuda:0',
       requires_grad=True)

In [74]:
denominator =  model.crf._compute_normalizer(logits, attention_mask)

In [76]:
denominator

tensor([ 811.8228, 1250.9913, 1245.3801], device='cuda:0',
       grad_fn=<LogsumexpBackward0>)

In [40]:
seq_length = logits.size(0)

In [41]:
seq_length

495

In [42]:
score = model.crf.start_transitions + logits[0]

In [43]:
score

tensor([[ 0.3755,  0.9901, -0.5380, -1.1875, -0.5367, -0.2067, -0.7468,  0.1100,
         -0.5505,  0.3479, -1.5259,  1.0269, -1.2292],
        [ 0.0536,  1.5502, -1.5227, -0.1975, -0.3847, -0.8583, -2.0062, -1.1175,
         -0.2108,  0.6065, -0.4465, -0.1730, -1.1032],
        [ 0.3711,  3.0916, -0.1467, -0.3943,  0.4998,  0.5422, -1.5350, -1.0084,
         -0.5701,  0.6391, -1.5058, -0.5883,  0.1358]], device='cuda:0',
       grad_fn=<AddBackward0>)

In [44]:
# Broadcast score for every possible next tag
# shape: (batch_size, num_tags, 1)
broadcast_score = score.unsqueeze(2)

In [46]:
i = 1

In [45]:
broadcast_score.shape

torch.Size([3, 13, 1])

In [47]:
broadcast_emissions = logits[i].unsqueeze(1)

In [48]:
broadcast_emissions.shape

torch.Size([3, 1, 13])

In [54]:
model.crf.transitions.shape

torch.Size([13, 13])

In [49]:
next_score = broadcast_score + model.crf.transitions + broadcast_emissions

In [50]:
next_score

tensor([[[-2.6852e+35,  1.4958e+00, -2.6852e+35, -1.0562e-01, -6.6666e-01,
           1.8607e-01,  1.3747e-01,  9.5161e-02, -3.7512e-01,  2.2635e-01,
           4.8170e-01,  1.8281e-01, -1.0038e+00],
         [ 3.6386e-01,  2.1104e+00,  3.5200e-01,  5.0895e-01, -5.2092e-02,
           8.0064e-01,  7.5204e-01,  7.0973e-01,  2.3945e-01,  8.4092e-01,
           1.0963e+00,  7.9738e-01, -3.8921e-01],
         [-1.1643e+00,  5.8223e-01, -1.1762e+00, -1.0192e+00, -1.5803e+00,
          -7.2753e-01, -7.7612e-01, -8.1843e-01, -1.2887e+00, -6.8724e-01,
                  nan,         nan,         nan],
         [        nan, -6.7241e-02, -1.8256e+00, -1.6687e+00, -2.2297e+00,
          -1.3770e+00, -1.4256e+00, -1.4679e+00, -1.9382e+00, -1.3367e+00,
          -1.0814e+00, -1.3803e+00, -2.5668e+00],
         [-1.1629e+00,  5.8359e-01, -1.1748e+00, -1.0179e+00, -1.5789e+00,
          -7.2616e-01, -7.7476e-01, -8.1707e-01, -1.2874e+00, -6.8588e-01,
          -4.3053e-01, -7.2942e-01, -1.9160e+00],


In [None]:
for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

In [None]:

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

In [35]:
denominator

tensor([nan, nan, nan], device='cuda:0', grad_fn=<LogsumexpBackward0>)

In [36]:
numerator

tensor([-152.5824, -294.2303, -264.3447], device='cuda:0',
       grad_fn=<AddBackward0>)

In [38]:
llh = numerator - denominator

In [39]:
llh

tensor([nan, nan, nan], device='cuda:0', grad_fn=<SubBackward0>)

In [32]:
model.crf.start_transitions

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [28]:
attention_mask[0].all()

tensor(False, device='cuda:0')

In [None]:
# shape: (batch_size,)
numerator = self._compute_score(emissions, tags, mask)
# shape: (batch_size,)
denominator = self._compute_normalizer(emissions, mask)
# shape: (batch_size,)
llh = numerator - denominator

In [26]:
model.crf(logits, labels)

../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [27]:
labels

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [27]:
model.crf(logits, labels, mask=attention_mask, reduction="token_mean")

../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Pasted copy of the model's forward pass: encoder -> dropout -> linear classifier
        # -> CRF loss and decoding. NOTE(review): the fragment ends without a return
        # statement, and `Optional`/`Union`/`Tuple`/`TokenClassifierOutput` are not
        # imported in this notebook.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First element of the encoder output is fed to the classifier
        # (presumably the last hidden state — TODO confirm).
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        # remove [CLS] token for CRF working properly
        # NOTE(review): the three slices below run unconditionally, so a call with
        # `labels=None` or `attention_mask=None` fails here, before the
        # `labels is not None` check further down is reached.
        logits = logits[:, 1:]
        attention_mask = attention_mask[:, 1:].type(torch.bool)
        # `.clip(0)` raises every negative label id (e.g. the -100 ignore index) to 0.
        # NOTE(review): 0 is itself a class id, so any -100 position that is still
        # unmasked would be scored as class 0 — confirm this is intended.
        labels = labels[:, 1:].clip(0)
        
        loss = None
        if labels is not None:
            # The CRF output is negated to obtain a loss to minimize
            # (presumably it returns a log-likelihood — TODO confirm).
            loss = -self.crf(logits, labels, mask=attention_mask, reduction="token_mean")
            tags = self.crf.decode(logits, mask=attention_mask)
        else:
            tags = self.crf.decode(logits, mask=attention_mask)
        
        # list[list[int]] -> padded tensor
        # Decoded sequences shorter than the (CLS-stripped) sequence length are
        # right-padded with -100.
        _, seq_length = attention_mask.shape
        padded_tags = [tag + [-100] * (seq_length - len(tag)) for tag in tags]
        tags = torch.tensor(padded_tags, dtype=torch.long, device=logits.device)

In [11]:
batch.keys()

NameError: name 'batch' is not defined

In [30]:
out = model(**batch)

In [31]:
out.loss

tensor(nan, device='cuda:0', grad_fn=<NegBackward0>)

In [18]:
trainer.train()
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mt-ionov[0m ([33mdeeppavlov_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 104.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 104.12 MiB is free. Process 875765 has 4.49 GiB memory in use. Including non-PyTorch memory, this process has 11.30 GiB memory in use. Of the allocated memory 9.81 GiB is allocated by PyTorch, and 1.19 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.save_model(model_save_path)

## Post-Evaluation

In [None]:
# Load model from saved if needed
# NOTE(review): this rebinds `tokenizer`, `model`, `data_collator` and `trainer` from the
# training section. The checkpoint path differs from `model_save_path` and is loaded with
# `AutoModelForTokenClassification`, not the CRF class used for training above — confirm
# that this is the model meant to be evaluated.

model_checkpoint = "/archive/savkin/models/ner/PII Data Detection/deberta-base-4000"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Prediction-only trainer: no datasets or metric callback, and W&B reporting disabled.
trainer = Trainer(
    args = TrainingArguments(output_dir="tmp_trainer", report_to="none"),
    model=model,
    data_collator=data_collator
)

In [None]:
run = wandb.init(name=f"{wandb_run_name}-post-evaluation", job_type="post-evaluation")

In [None]:
predictions = trainer.predict(tokenized_dataset["valid"])

## Log metrics depending on the threshold

In [None]:
# Candidate thresholds handed to `apply_threshold` (from `utils`), one sweep step each.
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99]
# thresholds = [0.5, 0.6, 0.7]

# The raw model outputs and gold sub-token labels do not depend on the threshold,
# so they are read once instead of on every iteration.
preds = predictions.predictions
true_labels = predictions.label_ids

# threshold -> metrics dict returned by `compute_metrics_from_labels`
thresholded_metrics = {}
# Placeholder; the best threshold is selected from `thresholded_metrics` afterwards.
best_threshold = 0
for threshold in thresholds:
    thresholed_pred_labels = apply_threshold(preds, threshold, O_label_id)

    metrics = compute_metrics_from_labels(thresholed_pred_labels, true_labels)
    thresholded_metrics[threshold] = metrics
    f5 = metrics["overall_f5_score"]
    print(f"Threshold {threshold}, overall_f5_score = {f5}")

In [None]:
# Register "threshold" as the x-axis for every "thresholded_*" metric and keep the
# maximum of each one in the run summary.
run.define_metric("threshold")
run.define_metric("thresholded_*", step_metric="threshold", summary="max")

# One log call per (threshold, metric) pair.
for threshold, metrics in thresholded_metrics.items():
    for metric_name, metric in metrics.items():
        new_metric_name = "thresholded_" + metric_name
        run.log({new_metric_name: metric, "threshold": threshold})

# Pick the threshold with the highest overall F5 and store it in the run summary.
thresholed_f5_scores = [
    metrics_at_threshold["overall_f5_score"]
    for metrics_at_threshold in thresholded_metrics.values()
]
best_threshold_id = np.argmax(thresholed_f5_scores)
best_threshold = thresholds[best_threshold_id]
run.summary["best_overall_threshold"] = best_threshold
run.summary["best_overall_f5_score"] = thresholed_f5_scores[best_threshold_id]

## Aggregate subtoken-level predictions into word-level predictions

In [None]:
# Highest softmax probability over the class axis, per sub-token.
pred_probas = softmax(predictions.predictions, axis=-1).max(-1)
# Label ids after applying the selected threshold; `apply_threshold` comes from `utils`
# (presumably it falls back to the O label below the threshold — TODO confirm).
pred_labels = apply_threshold(predictions.predictions, best_threshold, O_label_id)
# Gold sub-token label ids returned by `trainer.predict`.
true_labels = predictions.label_ids

In [None]:
# Column-wise accumulator: one entry per sub-token that was predicted as an entity.
submission = {
    column: []
    for column in ("row_id", "document", "token", "label", "subtoken_str", "word_str", "proba")
}

valid_split = tokenized_dataset["valid"]
per_document = zip(
    valid_split["input_ids"],
    valid_split["word_ids"],
    valid_df.index,
    valid_df["document"],
    valid_df["tokens"],
    pred_labels,
    pred_probas,
)

for input_ids, word_ids, row_id, document, words, p_labels, p_probas in per_document:
    subtokens = tokenizer.convert_ids_to_tokens(input_ids)
    # Iteration stops at the shortest input, i.e. at the real sub-tokens of the document.
    for subtoken, word_id, label_id, proba in zip(subtokens, word_ids, p_labels, p_probas):
        # Skip ignored positions, O-labels and special tokens.
        if label_id == -100 or label_id == O_label_id or word_id is None:
            continue
        submission["row_id"].append(row_id)
        submission["document"].append(document)
        submission["token"].append(word_id)
        submission["label"].append(id2label[label_id])
        submission["subtoken_str"].append(subtoken)
        submission["word_str"].append(words[word_id])
        submission["proba"].append(proba)


df = pd.DataFrame.from_dict(submission)
df = df.drop_duplicates()
df = df.sort_values(by=["document", "token"])

# Independent copy without any "O" rows.
subtoken_df = df[df["label"] != "O"].copy(deep=True)
subtoken_df.head()

In [None]:
def aggregate_subtokens(df, label_agg_type = "most_frequent", add_subtoken_info = False):
    """Collapse the sub-token predictions of one word into a single row.

    Args:
        df: Rows of a single (document, token) group with at least the columns
            "row_id", "label", "subtoken_str" and "proba". Must not be empty.
        label_agg_type: How the word label is chosen:
            "most_frequent" - the label with the most sub-tokens; "agg_proba" is the
            mean probability of the sub-tokens carrying that label;
            "first" - label and probability of the first sub-token;
            "max_proba" - label and probability of the most confident sub-token.
            Any other value leaves the first row's label untouched and adds no
            "agg_proba".
        add_subtoken_info: When true, the lists of sub-token strings and probabilities
            are stored under "subtokens" and "probas".

    Returns:
        A ``pandas.Series`` based on the first row of the group, with "label" replaced
        by the aggregated label and the extra fields described above.
    """
    df = df.reset_index()
    # Work on a copy so that adding fields never writes into the group frame.
    row = df.iloc[0].copy()

    if add_subtoken_info:
        row["subtokens"] = df["subtoken_str"].tolist()
        row["probas"] = df["proba"].tolist()

    if label_agg_type == "most_frequent":
        row["label"] = df.groupby(["label"])["row_id"].count().sort_values().index[-1]
        row["agg_proba"] = df.loc[df["label"] == row["label"], "proba"].mean()
    elif label_agg_type == "first":
        # Positional access: `Series.agg(lambda x: x[0])` may be applied element-wise
        # and would then return the first character of every label string.
        row["label"] = df["label"].iloc[0]
        row["agg_proba"] = df["proba"].iloc[0]
    elif label_agg_type == "max_proba":
        # `idxmax` returns an index label, so it is looked up with `.loc`.
        row["label"] = df.loc[df["proba"].idxmax(), "label"]
        row["agg_proba"] = df["proba"].max()

    return row

# submission_df = submission_df_subtoken_level.groupby(["document", "token"]) \
#                                             .apply(aggregate_subtokens, label_agg_type="most_frequent") \
#                                             .reset_index(drop=True) \
#                                             .drop(columns=["index", "proba", "agg_proba"])

# One row per (document, word): aggregate that word's sub-token rows with the default
# label strategy, then drop the helper "index" column and the per-sub-token columns.
word_df = subtoken_df.groupby(["document", "token"]) \
                     .apply(aggregate_subtokens, add_subtoken_info=True) \
                     .reset_index(drop=True) \
                     .drop(columns=["index", "subtoken_str", "proba"])
word_df.head()

## Logging word-level predictions

## Log word-level metrics

In [None]:
# Gold word-level label ids, one list per validation document.
true_word_labels = valid_df["labels"].apply(lambda labels: [label2id[l] for l in labels]).tolist()

# Create a template filled with "O" label
pred_word_labels = valid_df["labels"].agg(lambda x: [O_label_id for _ in x]).tolist()

# Group words into documents and reorder documents according to validation dataset
original_document_order = valid_df["document"].tolist()
document_df = word_df[["document", "token", "label"]].groupby("document").agg(list)
# NOTE(review): documents without any predicted entity are absent from `word_df`; this
# relies on `reindex` accepting `fill_value=[]` for them — confirm with the installed pandas.
reordered_document_df = document_df.reindex(original_document_order, fill_value=[])
 
# Add predictions to the template
# Row i of the reordered frame corresponds to document i of `valid_df`.
for i, (_, row) in enumerate(reordered_document_df.iterrows()):
    if len(row["token"]) > 0:
        for token_id, l in zip(row["token"], row["label"]):
            pred_word_labels[i][token_id] = label2id[l]
    
# Both arguments are per-document lists of word-level label ids (built from `label2id`,
# so they contain no -100 entries).
word_level_metrics = compute_metrics_from_labels(pred_word_labels, true_word_labels)

In [None]:
# Store every word-level metric in the run summary under a "word_level_" prefix.
run.define_metric("word_level*")
for metric_name, metric in word_level_metrics.items():
    run.summary["word_level_" + metric_name] = metric

## Log model mistakes

In [None]:
# Build one table row per mispredicted word (word-level prediction != gold label).

# Columns of the error table, in display order. Passing them explicitly keeps the frame
# well-formed (and sortable) even when there are no errors at all.
error_columns = [
    "document", "word", "word_id", "pred_label", "true_label",
    "target_viz", "pred_viz", "subtokens", "probas", "agg_proba",
]

error_rows = []
for (_, valid_row), pred_doc_labels, true_doc_labels in zip(valid_df.iterrows(), pred_word_labels, true_word_labels):

    pred_doc_labels = np.array(pred_doc_labels)
    true_doc_labels = np.array(true_doc_labels)
    errors_mask = pred_doc_labels != true_doc_labels

    if not errors_mask.any():
        continue

    words = valid_row["tokens"]
    trailing_whitespaces = valid_row["trailing_whitespace"]
    doc_id = valid_row["document"]

    # Rendered once per document and shared by all error rows of that document.
    target_vizualization = wandb.Html(visualize_ents(words, trailing_whitespaces, [id2label[l] for l in true_doc_labels]))
    pred_vizualization = wandb.Html(visualize_ents(words, trailing_whitespaces, [id2label[l] for l in pred_doc_labels]))

    doc_pred_rows = word_df[word_df["document"] == doc_id]

    for w_id in np.flatnonzero(errors_mask):
        w_id = int(w_id)
        pred_row = doc_pred_rows[doc_pred_rows["token"] == w_id]
        assert len(pred_row) <= 1

        # A fresh dict per word, appended inside this loop: every wrong word gets its
        # own row instead of only the last one of the document surviving.
        row = {
            "document": doc_id,
            "word": words[w_id],
            "word_id": w_id,
            "pred_label": id2label[int(pred_doc_labels[w_id])],
            "true_label": id2label[int(true_doc_labels[w_id])],
            "target_viz": target_vizualization,
            "pred_viz": pred_vizualization,
            # Filled below when the word was predicted as an entity (missed entities
            # have no row in `word_df`).
            "subtokens": None,
            "probas": None,
            "agg_proba": None,
        }
        if len(pred_row) == 1:
            row["subtokens"] = pred_row["subtokens"].iloc[0]
            row["probas"] = pred_row["probas"].iloc[0]
            row["agg_proba"] = pred_row["agg_proba"].iloc[0]

        error_rows.append(row)


error_df = pd.DataFrame(error_rows, columns=error_columns).sort_values(by=["document", "word_id"])
error_df.head()

In [None]:
error_table = wandb.Table(dataframe=error_df)
run.summary["error_table"] = error_table

In [None]:
wandb.finish()