In [1]:
import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

In [3]:
os.environ["WANDB_PROJECT"] = "PII Data Detection"
os.environ["WANDB_ENTITY"] = "deeppavlov_team"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_WATCH"] = "all"

In [4]:
import copy
import gc
import json
import os
import re
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets
from spacy.lang.en import English
from torch import nn
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.data.data_collator import DataCollatorForTokenClassification
from transformers.models.deberta_v2 import (
    DebertaV2ForTokenClassification,
    DebertaV2TokenizerFast,
)
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.trainer import Trainer
from transformers.trainer_utils import EvalPrediction
from transformers.training_args import TrainingArguments

import wandb
import random

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
SEED = 42  
def seed_everywhere(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everywhere(SEED)

In [6]:
MAX_LENGTH = 1024
CONF_THRESH = 0.9
LR = 2.5e-5
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 3
BATCH_SIZE = 1
EVAL_BATCH_SIZE = 8
GRAD_ACCUMULATION_STEPS = 16 // BATCH_SIZE
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
AMP = True
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 6
N_SPLITS = 4
NEGATIVE_RATIO = 0.3  # down sample ratio of negative samples in the training set
OUTPUT_DIR = "output"
Path(OUTPUT_DIR).mkdir(exist_ok=True)

## Dataset

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
ext = pd.read_json('/archive/savkin/parsed_datasets/NER/PII_Data_Detection/mpware_mixtral8x7b_v1.1-no-i-username.json')

In [9]:
og = pd.read_json('/archive/savkin/parsed_datasets/NER/PII_Data_Detection/train.json')
train, val = train_test_split(og, test_size=0.1, stratify=og.labels.apply(lambda x: set(x) == set(['O'])))

In [10]:
train = pd.concat([ext, train])

In [11]:
train['document'] = train['document'].astype('str')
val['document'] = val['document'].astype('str')

In [12]:
all_labels = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS','B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-USERNAME', 'O']

In [13]:
id2label = {0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}

In [14]:
label2id = {v:k for k,v in id2label.items()}

In [15]:
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
# MODEL_NAME = "google/gemma-2b"

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [17]:
def tokenize(example, tokenizer, label2id, max_length):
    text = []

    # these are at the character level
    labels = []

    for t, l, ws in zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"]):

        text.append(t)
        labels.extend([l]*len(t))

        # if there is trailing whitespace
        if ws:
            text.append(" ")
            labels.append("O")


    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=False, max_length=max_length)

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:

        # CLS token
        if start_idx + end_idx == 0: 
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1
        
        while start_idx >= len(labels):
            start_idx -= 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {
        **tokenized,
        "labels": token_labels,
        "length": length
    }

In [18]:
train_ds = Dataset.from_dict({
    "full_text": train.full_text.values,
    "document": train.document.values,
    "tokens": train.tokens.values,
    "trailing_whitespace": train.trailing_whitespace.values,
    "provided_labels": train.labels.values, # renamed
})

In [19]:
val_ds = Dataset.from_dict({
    "full_text": val.full_text.values,
    "document": val.document.values,
    "tokens": val.tokens.values,
    "trailing_whitespace": val.trailing_whitespace.values,
    "provided_labels": val.labels.values, # renamed
})

In [20]:
train_ds = train_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": MAX_LENGTH}, num_proc=3)

Map (num_proc=3): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8818/8818 [00:23<00:00, 369.56 examples/s]


In [21]:
val_ds = val_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": MAX_LENGTH}, num_proc=3)

Map (num_proc=3): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 681/681 [00:02<00:00, 333.54 examples/s]


In [22]:
x = train_ds[0]

for t,l in zip(x["tokens"], x["provided_labels"]):
    if l != "O":
        print((t,l))

print("*"*100)

for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[l] != "O":
        print((t,id2label[l]))

('Danielle.Johnson@sanchez-taylor.com', 'B-EMAIL')
('156018640', 'B-USERNAME')
('DLLT40781618495931', 'B-ID_NUM')
('3268542351', 'B-PHONE_NUM')
('http://www.instagram.com/zlawrence', 'B-URL_PERSONAL')
('600', 'B-STREET_ADDRESS')
('Jeffery', 'I-STREET_ADDRESS')
('Parkways', 'I-STREET_ADDRESS')
('New', 'I-STREET_ADDRESS')
('Jamesside', 'I-STREET_ADDRESS')
(',', 'I-STREET_ADDRESS')
('MT', 'I-STREET_ADDRESS')
('29394', 'I-STREET_ADDRESS')
('Danielle', 'B-NAME_STUDENT')
('Johnson', 'I-NAME_STUDENT')
('Danielle', 'B-NAME_STUDENT')
('Johnson', 'I-NAME_STUDENT')
('Danielle', 'B-NAME_STUDENT')
('Johnson', 'I-NAME_STUDENT')
('Danielle', 'B-NAME_STUDENT')
('Johnson', 'I-NAME_STUDENT')
('Danielle', 'B-NAME_STUDENT')
('Johnson', 'I-NAME_STUDENT')
****************************************************************************************************
('▁Daniel', 'B-EMAIL')
('le', 'B-EMAIL')
('.', 'B-EMAIL')
('John', 'B-EMAIL')
('son', 'B-EMAIL')
('@', 'B-EMAIL')
('s', 'B-EMAIL')
('anche', 'B-EMAIL')
('z

## Competition metrics

In [23]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
import evaluate
seqeval_metrics = evaluate.load("seqeval")

def f5_score(precision, recall):
    return (1 + 5*5) * recall * precision / (5*5*precision + recall + 1e-100)

def compute_metrics_from_labels(predictions, labels, id2label=id2label):
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval_metrics.compute(predictions=true_predictions, references=true_labels)
    for label, scores in results.items():
        if "overall" not in label:
            precision = scores["precision"]
            recall = scores["recall"]
            results[label]["f5_score"] = f5_score(precision, recall)
    precision = results["overall_precision"]
    recall = results["overall_recall"]
    results["overall_f5_score"] = f5_score(precision, recall)

    return results


def compute_metrics(eval_preds):
    logits, labels = eval_preds

    predictions = np.argmax(logits, axis=-1)

    return compute_metrics_from_labels(predictions, labels)

In [24]:
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
from transformers import MistralModel, MistralPreTrainedModel, BitsAndBytesConfig, AutoModelForCausalLM
from typing import Optional, Union, List, Tuple
from torch.nn import CrossEntropyLoss
from peft import get_peft_model, LoraConfig, TaskType
from peft import prepare_model_for_kbit_training

In [25]:
#!pip install peft

In [26]:
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
PEFT_CONFIG = LoraConfig(task_type=None, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.0, bias="none")

In [28]:
class MistralForTokenClassification(MistralPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        config.quantization_config = BNB_CONFIG
        
        mistral = AutoModelForCausalLM.from_config(config)
        mistral = prepare_model_for_kbit_training(mistral)

        self.mistral = get_peft_model(mistral, PEFT_CONFIG)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.loss_fn = CrossEntropyLoss()

        self.init_weights()

    def forward(
        self,
        input_ids: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mistral(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [29]:
import transformers; transformers.__version__

'4.39.3'

In [30]:
# pip install -U transformers==4.35

In [31]:
# !pip install -i https://pypi.org/simple/ bitsandbytes

In [32]:
# !pip install accelerate

In [33]:
# mistral = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=BNB_CONFIG, resume_download=True)

In [34]:
torch.cuda.empty_cache()

In [39]:
model = MistralForTokenClassification.from_pretrained(
    MODEL_NAME,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.42it/s]
Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.mistral.base_model.model.lm_head.weight', 'model.mistral.base_model.model.model.embed_tokens.weight', 'model.mistral.base_model.model.model.layers.0.input_layernorm.weight', 'model.mistral.base_model.model.model.layers.0.mlp.down_proj.weight', 'model.mistral.base_model.model.model.layers.0.mlp.gate_proj.weight', 'model.mistral.base_model.model.model.layers.0.mlp.up_proj.

In [36]:
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [40]:
wandb_run_name = f"mistral-peft-{MAX_LENGTH}"

In [41]:
# torch.cuda.empty_cache()

In [42]:
print(torch.cuda.device_count())

2


In [43]:
args = TrainingArguments(
    output_dir=f'training_logs/{wandb_run_name}', 
    run_name=wandb_run_name,
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    report_to="wandb",
    evaluation_strategy="steps",
    save_strategy='steps',
    save_steps=500,
    eval_steps=500,
    logging_steps=50,
    do_eval=True,
    save_total_limit=3,
    lr_scheduler_type='cosine',
    metric_for_best_model="overall_f5_score",
    load_best_model_at_end=True,
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [44]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mt-ionov[0m ([33mdeeppavlov_team[0m). Use [1m`wandb login --relogin`[0m to force relogin




OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1561, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/tmp/ipykernel_3721170/250094790.py", line 32, in forward
    outputs = self.mistral(
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/peft/peft_model.py", line 563, in forward
    return self.get_base_model()(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1157, in forward
    outputs = self.model(
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1042, in forward
    layer_outputs = decoder_layer(
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 770, in forward
    hidden_states = self.mlp(hidden_states)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ionov/anaconda3/envs/kaggle/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 179, in forward
    return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 60.00 MiB. GPU 0 has a total capacity of 39.39 GiB of which 53.50 MiB is free. Process 1370598 has 4.05 GiB memory in use. Including non-PyTorch memory, this process has 35.26 GiB memory in use. Of the allocated memory 34.23 GiB is allocated by PyTorch, and 238.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
# trainer.save_model(f"nuner_{MAX_LEG}")
# tokenizer.save_pretrained("mistal-peft")

In [79]:
del model
del trainer
torch.cuda.empty_cache()