In [1]:
import os
import json
import pandas as pd
import numpy as np
import pprint as pp
import pickle as pkl

import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import TextClassificationPipeline, pipeline, set_seed
from transformers import DebertaTokenizer, DebertaForSequenceClassification, DebertaConfig
from datasets import load_from_disk

In [2]:
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
SHAP_DIR = os.path.join(PROJECT_DIR, "classification/shap_values/coqa")
SPLITS_DIR = os.path.join(PROJECT_DIR, "classification/split_datasets/coqa")
MODELS_DIR = os.path.join(PROJECT_DIR, "classification/models")
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'classification/preds')
LOGS_DIR = os.path.join(PROJECT_DIR, 'logs')

raw_dataset = load_from_disk(SPLITS_DIR)

!export "CUDA_VISIBLE_DEVICES"=2 jupyter notebook
!export "CUDA_VISIBLE_DEVICES"=2
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

In [3]:
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:200"

In [4]:
set_seed(42)

label_mapping = {'False': 0, 'True': 1}

#model_name = 'microsoft/deberta-v3-small'
model_name = 'microsoft/deberta-v3-base'

model_config = AutoConfig.from_pretrained(model_name,
                                          num_labels=len(label_mapping),
                                          hidden_dropout_prob=0.3,
                                          attention_probs_dropout_prob=0.3)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.to(device)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

### Transformers Setup

In [5]:
def tokenize_and_mask(raw_data):

    '''Tokenize
    Normal padding: set padding='max_length' and max_length=int (default is 512)
    Dynamic padding: set padding=False and (later in the Trainer) pass `data_collator=DataCollatorWithPadding(tokenizer)
    result will be a dict with keys 'input_ids', 'attention_mask'
    '''
    result = tokenizer(raw_data["text"],
                       max_length=512,
                       truncation=True,
                       #padding='max_length'
                       )

    '''Add labels'''
    if label_mapping is not None:
        if "label" in raw_data:
            result['labels'] = [label_mapping[str(label)] for label in raw_data["label"]]
    
    return result

In [6]:
'''Prepare inputs: tokenize and mask'''
dataset = raw_dataset.map(tokenize_and_mask, batched=True)

for split in ['train', 'validation', 'test']:
    dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [7]:
from transformers import default_data_collator, DataCollatorWithPadding

data_collator_dynamic_padding = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

#### Run on single GPU 

Discussion and possible solutions here:
https://github.com/huggingface/transformers/issues/12570

with latest version (transformers-4.34.0.dev0): 
AttributeError: 'customTrainingArguments' object has no attribute 'distributed_state' (for both solutions)

downgrade to:
pip install --upgrade transformers==4.29.2

In [8]:
class customTrainingArguments(TrainingArguments):
    def __init__(self,*args, **kwargs):
        super(customTrainingArguments, self).__init__(*args, **kwargs)

    @property
    #@torch_required
    def device(self) -> "torch.device":
        """
        The device used by this process.
        Name the device the number you use.
        """
        return torch.device("cuda:2")

    @property
    #@torch_required
    def n_gpu(self):
        """
        The number of GPUs used by this process.
        Note:
            This will only be greater than one when you have multiple GPUs available but are not using distributed
            training. For distributed training, it will always be 1.
        """
        # Make sure `self._n_gpu` is properly setup.
        # _ = self._setup_devices
        # I set to one manullay
        self._n_gpu = 1
        return self._n_gpu

### Training Parameters

In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
from transformers import Trainer, AdamW, get_cosine_schedule_with_warmup, EarlyStoppingCallback

#training_args = TrainingArguments(
training_args = customTrainingArguments( # use customArg to enforce using one GPU
    output_dir=OUTPUT_DIR,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=0.0001,
    warmup_steps=200,
    weight_decay=0.05,
    do_train=True,
    do_eval=True,
    logging_dir=LOGS_DIR,
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    load_best_model_at_end=True, # needed for early stopping
    metric_for_best_model='accuracy', # needed for early stopping
)

optimizer = AdamW(
    model.parameters(), 
    lr=training_args.learning_rate,
    betas=(0.9, 0.98),
    eps=1e-6, # numerical stability
)

total_steps = len(dataset['train']) // training_args.per_device_train_batch_size * training_args.num_train_epochs
print(f"Number of training steps: {total_steps}")

scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=training_args.warmup_steps, 
    num_training_steps=total_steps,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args, # defined above or v1_training_args
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    data_collator=data_collator_dynamic_padding, # default_data_collator or data_collator_dynamic_padding
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

Number of training steps: 5000




### Finetune + evaluate

In [11]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB (GPU 2; 10.75 GiB total capacity; 9.25 GiB already allocated; 93.62 MiB free; 9.92 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [12]:
len(dataset['train']) // training_args.per_device_train_batch_size # steps per epoch

500

In [None]:
def get_report(preds, split):
    y_trues = [0 if raw_dataset[split]['label'][i]==False else 1 for i in range(len(raw_dataset[split]))]
    y_preds = preds.predictions.argmax(-1)
    print(classification_report(y_trues, y_preds, labels=[0,1]))

In [None]:
split = 'validation'
get_report(trainer.predict(dataset[split]), split)

              precision    recall  f1-score   support

           0       0.88      0.60      0.72       394
           1       0.79      0.95      0.86       606

    accuracy                           0.81      1000
   macro avg       0.83      0.78      0.79      1000
weighted avg       0.82      0.81      0.80      1000



In [None]:
split = 'test'
get_report(trainer.predict(dataset[split]), split)

              precision    recall  f1-score   support

           0       0.89      0.57      0.69       378
           1       0.78      0.96      0.86       622

    accuracy                           0.81      1000
   macro avg       0.84      0.76      0.78      1000
weighted avg       0.82      0.81      0.80      1000



In [None]:
'''Save model'''
import datetime
model_id = datetime.datetime.now().strftime("%d%m%H%M")
#trainer.save_model(os.path.join(MODELS_DIR, f"distilbert-base-uncased_{model_id}"))