In [1]:
import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

In [3]:
os.environ["WANDB_PROJECT"] = "PII Data Detection"
os.environ["WANDB_ENTITY"] = "deeppavlov_team"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_WATCH"] = "all"

In [4]:
import copy
import gc
import json
import os
import re
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets
from spacy.lang.en import English
from torch import nn
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.data.data_collator import DataCollatorForTokenClassification
from transformers.models.deberta_v2 import (
    DebertaV2ForTokenClassification,
    DebertaV2TokenizerFast,
)
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.trainer import Trainer
from transformers.trainer_utils import EvalPrediction
from transformers.training_args import TrainingArguments

import wandb
import random

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
TRAINING_MODEL_PATH = "numind/NuNER-v0.1"
OUTPUT_DIR = "/archive/ionov/pii/output"

In [6]:
MAX_LENGTH = 4000
CONF_THRESH = 0.9
LR = 2.5e-5
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 3
BATCH_SIZE = 1
EVAL_BATCH_SIZE = 8
GRAD_ACCUMULATION_STEPS = 16 // BATCH_SIZE
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
AMP = True
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 6
N_SPLITS = 4
NEGATIVE_RATIO = 0.3  # down sample ratio of negative samples in the training set
OUTPUT_DIR = "output"
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [10]:
# !pip install seqeval

In [11]:
SEED = 42  
def seed_everywhere(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everywhere(SEED)

## 🗺️ Data Selection and Label Mapping
- As mentioned before, I additionaly use the moth dataset

In [12]:
ext =  json.load(open('/archive/savkin/parsed_datasets/NER/PII_Data_Detection/mpware_mixtral8x7b_v1.1.json'))

In [13]:
stay_columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels']

In [14]:
ext = pd.DataFrame.from_records(ext)

In [15]:
ext['has_ents'] = ext.labels.apply(lambda x: set(x) == set(['O']))

In [16]:
ext.has_ents.value_counts(1)

has_ents
False    0.862927
True     0.137073
Name: proportion, dtype: float64

In [17]:
orig = json.load(open('/archive/savkin/parsed_datasets/NER/PII_Data_Detection/orig_train_custom_split.json'))

In [18]:
orig = pd.DataFrame(orig)

In [19]:
train = orig[orig.valid == False]
val = orig[orig.valid == True]

In [20]:
pos = train[train.has_ents]
neg = train[~train.has_ents].sample(len(pos))
train = pd.concat([pos, neg, ext])

In [21]:
train.has_ents.value_counts(1)

has_ents
False    0.758602
True     0.241398
Name: proportion, dtype: float64

In [31]:
train['document'] = train['document'].astype('str')
val['document'] = val['document'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['document'] = val['document'].astype('str')


In [23]:
all_labels = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS','B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-USERNAME', 'O']

In [24]:
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'I-USERNAME', 13: 'O'}


In [25]:
def tokenize(example, tokenizer, label2id, max_length):

    # rebuild text from tokens
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=max_length)

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [29]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

In [32]:
train_ds = Dataset.from_dict({
    "full_text": train.full_text.values,
    "document": train.document.values,
    "tokens": train.tokens.values,
    "trailing_whitespace": train.trailing_whitespace.values,
    "provided_labels": train.labels.values, # renamed
})

In [33]:
val_ds = Dataset.from_dict({
    "full_text": val.full_text.values,
    "document": val.document.values,
    "tokens": val.tokens.values,
    "trailing_whitespace": val.trailing_whitespace.values,
    "provided_labels": val.labels.values, # renamed
})

In [34]:
train_ds = train_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": MAX_LENGTH}, num_proc=3)

Map (num_proc=3):   0%|                                                                                                                                                                                                                                              | 0/3778 [00:00<?, ? examples/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Map (num_proc=3):   0%|▍                                                                                                                                                                                                                                     | 7/3778 [00:00<04:26, 14.17 examples/s]Truncation was not explicitly activated b

In [35]:
val_ds = val_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": MAX_LENGTH}, num_proc=3)

Map (num_proc=3):   0%|                                                                                                                                                                                                                                              | 0/1690 [00:00<?, ? examples/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Map (num_proc=3):   0%|▊                                                                                                                                                                                                                                     | 6/1690 [00:00<01:44, 16.09 examples/s]Truncation was not explicitly activated b

In [36]:
x = train_ds[0]

for t,l in zip(x["tokens"], x["provided_labels"]):
    if l != "O":
        print((t,l))

print("*"*100)

for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[l] != "O":
        print((t,id2label[l]))

('Paola', 'B-NAME_STUDENT')
('Garcia', 'I-NAME_STUDENT')
****************************************************************************************************
('Ċ', 'B-NAME_STUDENT')
('Pa', 'B-NAME_STUDENT')
('ola', 'B-NAME_STUDENT')
('ĠGarcia', 'I-NAME_STUDENT')


## 🧮 Competition metrics
- Note that we are not using the normal F1 score.
- Although it is early in the competition, there are plenty of discsussions already explaining this:
- e.g., here: https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/470024

In [38]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
import evaluate
seqeval_metrics = evaluate.load("seqeval")

def f5_score(precision, recall):
    return (1 + 5*5) * recall * precision / (5*5*precision + recall + 1e-100)

def compute_metrics_from_labels(predictions, labels, id2label=id2label):
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval_metrics.compute(predictions=true_predictions, references=true_labels)
    for label, scores in results.items():
        if "overall" not in label:
            precision = scores["precision"]
            recall = scores["recall"]
            results[label]["f5_score"] = f5_score(precision, recall)
    precision = results["overall_precision"]
    recall = results["overall_recall"]
    results["overall_f5_score"] = f5_score(precision, recall)

    return results


def compute_metrics(eval_preds):
    logits, labels = eval_preds

    predictions = np.argmax(logits, axis=-1)

    return compute_metrics_from_labels(predictions, labels)

In [39]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at numind/NuNER-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
wandb_run_name = f"nuner-base-{MAX_LENGTH}-downsample"

In [41]:
args = TrainingArguments(
    output_dir=f'training_logs/{wandb_run_name}', 
    run_name=wandb_run_name,
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=120,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    report_to="wandb",
    evaluation_strategy="steps",
    save_strategy='steps',
    save_steps=1000,
    eval_steps=500,
    logging_steps=250,
    do_eval=True,
    save_total_limit=3,
    lr_scheduler_type='cosine',
    metric_for_best_model="overall_f5_score",
    load_best_model_at_end=True,
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [42]:
%%time
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mt-ionov[0m ([33mdeeppavlov_team[0m). Use [1m`wandb login --relogin`[0m to force relogin




RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1561, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 1390, in forward
    outputs = self.roberta(
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 801, in forward
    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
RuntimeError: The expanded size of the tensor (1136) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [2, 1136].  Tensor sizes: [1, 514]


In [None]:
trainer.save_model(f"nuner_{MAX_LEG}")
tokenizer.save_pretrained("deberta3base_1024")

In [79]:
del model
del trainer
torch.cuda.empty_cache()