We will start with the microsoft/deberta-v3-base model and the MRPC subset of GLUE.

In [1]:
# !pip install transformers
# !pip install datasets
# !pip3 install torch torchvision
# ! pip install ipywidgets widgetsnbextension pandas-profiling
# ! pip install accelerate -U
# !pip install evaluate
!pip install scikit-learn



## Load Model
Here is the documentation:
https://huggingface.co/transformers/v4.9.1/model_doc/deberta_v2.html
For config:
https://huggingface.co/docs/transformers/v4.41.3/en/main_classes/configuration#transformers.PretrainedConfig



In [2]:
# AutoModelForSequenceClassification puts a classification head on top of the
# pretrained encoder; the tokenizer is loaded from the same checkpoint.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

MODEL_NAME = "microsoft/deberta-v3-base"

# The classification head gets two output labels.
config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 2

# use_fast=False requests the slow tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*2024.06.17* I encountered an issue while loading the tokenizer; here is a post that solved the problem (make sure to restart the kernel): https://discuss.huggingface.co/t/error-with-new-tokenizers-urgent/2847  
The "use_fast" parameter is from this post.  
Another question is whether we should set the number of labels to 1 or 2, since this looks like a binary classification problem. According to this Stack Overflow answer, 2 is also acceptable:  
https://stackoverflow.com/questions/71768061/huggingface-transformers-classification-using-num-labels-1-vs-2

## Load Data

In [3]:
from datasets import load_dataset
# Load the "mrpc" configuration of the "nyu-mll/glue" dataset.
# NOTE(review): this variable has the same name as the `datasets` package;
# later cells read it under this name, so it is left unchanged.
datasets = load_dataset("nyu-mll/glue", "mrpc")

In [4]:
# First take a sample of the data
# Pull out the training split and display its first example to see which
# fields each record carries.
train_dataset = datasets['train']
sample_1 = train_dataset[0]
sample_1

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

### Representation
The first problem I face is how to represent a paraphrase: paraphrase data is made of two sentences and a label indicating whether they are paraphrases of each other. So the label is clearly the output, but what should the input look like? Here we found a good answer: https://huggingface.co/transformers/v3.0.2/glossary.html#token-type-ids

In [5]:
# !pip install numpy==1.26.4

*2024.06.17* I encountered a problem: when I converted the dataset to torch format and then tried to access the first item of the train set, it showed this error:   
ValueError: Unable to avoid copy while creating an array as requested.  
I found a solution on this website:  
https://support.gurobi.com/hc/en-us/articles/25787048531601-Compatibility-issues-with-numpy-2-0#:~:text=ValueError%3A%20Unable%20to%20avoid%20copy,4).  
I fixed the problem by downgrading numpy.

In [6]:
import torch
from torch.utils.data import DataLoader
# tokenize the entire dataset: I make sure we pad every sentence (pair) to token length of 102
# I first use tokenizer() to tokenize the entire train,test,val set separately, and see that the maximum length of tokens is 102. 
def tokenize(sample, max_length=102):
    """Tokenize sentence pairs into fixed-length model inputs.

    Uses the module-level `tokenizer`.

    Args:
        sample: Mapping with 'sentence1' and 'sentence2' entries (a single
            example, or a batch of examples when called through
            `map(..., batched=True)`).
        max_length: Length every encoded pair is padded or truncated to.
            Defaults to 102, the value previously hard-coded here.

    Returns:
        The tokenizer output, with token type IDs and the attention mask
        requested explicitly.
    """
    return tokenizer(
        sample['sentence1'],
        sample['sentence2'],
        truncation=True,               # Truncate pairs longer than max_length
        padding='max_length',          # Pad shorter pairs up to max_length
        max_length=max_length,
        return_token_type_ids=True,    # Return token type IDs
        return_attention_mask=True,    # Return attention mask
    )

# Tokenize every split, drop the columns that are not model inputs, and
# rename the target column from "label" to "labels".
tokenized_datasets = (
    datasets
    .map(tokenize, batched=True)
    .remove_columns(['sentence1', 'sentence2', 'idx'])
    .rename_column("label", "labels")
)
# set_format is applied in place on the object built above.
tokenized_datasets.set_format("torch")

## Train the model

In [7]:
import evaluate
import numpy as np
# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(p):
    """Score predictions with the module-level `metric`.

    Args:
        p: Object exposing `predictions` (scores indexed as
            [example, class]) and `label_ids` (the reference labels).

    Returns:
        The result of `metric.compute` for the highest-scoring class of
        each example against the reference labels.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    scores = metric.compute(predictions=predicted_classes, references=p.label_ids)
    return scores

def record_config(sub_dir, hyper_parameters):
    """Store the hyper-parameters of a run as JSON.

    Args:
        sub_dir: Directory that receives `hyper_parameters.json`.
        hyper_parameters: JSON-serializable object to write.
    """
    target_path = os.path.join(sub_dir, 'hyper_parameters.json')
    with open(target_path, 'w') as handle:
        json.dump(hyper_parameters, handle)

def record_metrics(sub_dir, records, stage='train'):
    """Write metric records to `<stage>_metrics.json` inside `sub_dir`.

    Args:
        sub_dir: Directory that receives the file.
        records: JSON-serializable metrics to write.
        stage: Prefix of the file name; defaults to 'train'.
    """
    file_name = f'{stage}_metrics.json'
    with open(os.path.join(sub_dir, file_name), 'w') as handle:
        json.dump(records, handle, indent=4)


In [8]:
# use the EvalPrediction as in the trainer class's compute_metrics parameter. 
from transformers import EvalPrediction
# to calculate the cross entropy

import torch.nn.functional as F

def train_epochs(model, dataloaders, num_train_epochs, device, optimizer, scheduler):
    """Run a plain training loop and log loss/accuracy for every batch.

    Args:
        model: Model called as `model(**batch)`; its output must expose `logits`.
        dataloaders: Mapping whose 'train' entry yields batches as dicts of
            tensors, including a 'labels' entry.
        num_train_epochs: Number of passes over `dataloaders['train']`.
        device: Device the model and every batch are moved to.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.

    Returns:
        Tuple `(model, records)`; `records` holds one dict per batch with
        the keys "epoch", "batch", "loss" and "accuracy".
    """
    model.to(device)
    # Models returned by `from_pretrained` start in evaluation mode (dropout
    # disabled), so switch to training mode explicitly. The previous mode is
    # restored before returning so callers see the same state as before.
    was_training = model.training
    model.train()

    records = []
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(dataloaders['train']):
            # put everything on the right device
            batch = {k: v.to(device) for k, v in batch.items()}
            # clear gradients left over from the previous step
            optimizer.zero_grad()
            # the batch already includes the 'labels' (y) and the input_ids, masks, token_type_ids (x).
            outputs = model(**batch)
            # compute the cross-entropy explicitly from the logits instead of relying on outputs.loss
            loss = F.cross_entropy(input=outputs.logits, target=batch['labels'])
            # back propagation
            loss.backward()
            # parameter update
            optimizer.step()
            # scheduler adjusts the learning rate
            scheduler.step()
            # record the loss and accuracy; tensors must be moved to the CPU
            # before the NumPy conversion, otherwise this fails on a CUDA device
            logits_np = outputs.logits.detach().cpu().numpy()
            labels_np = batch['labels'].detach().cpu().numpy()
            pred_and_tags = EvalPrediction(predictions=logits_np, label_ids=labels_np)
            record = {"epoch":epoch, "batch":step, "loss":loss.item(), "accuracy":compute_metrics(pred_and_tags)['accuracy']}
            print(record)
            records.append(record)

    model.train(was_training)
    return model, records




In [14]:
# construct training arguments, for now I am just changing  batch-size, number of epochs, learning rate, scheduler.
from transformers import get_scheduler
from torch.optim import AdamW
import os
import json

def train_model(model, tokenized_datasets, hyper_parameters, tuned_parameters):
    """Train `model` with one hyper-parameter setting and save the run's artifacts.

    The run directory is `<output_dir>/<k1>=<v1>_<k2>=<v2>...`, built from the
    keys listed in `tuned_parameters`. It receives the hyper-parameters, the
    per-batch training metrics, the trained model and the module-level
    `tokenizer`.

    Args:
        model: Model to train; its parameters are handed to AdamW.
        tokenized_datasets: Mapping with a 'train' entry usable by a DataLoader.
        hyper_parameters: Dict providing 'device',
            'per_device_train_batch_size', 'learning_rate',
            'lr_scheduler_type', 'lr_scheduler_kwargs', 'num_train_epochs'
            and 'output_dir'.
        tuned_parameters: Keys of `hyper_parameters` whose values name the
            run directory.

    Returns:
        None.
    """
    # get device
    device = torch.device(hyper_parameters['device'])

    # batch size: the name mirrors TrainingArguments, which can spread batches over several GPUs
    per_device_train_batch_size = hyper_parameters['per_device_train_batch_size']
    # Only the training split is loaded; no test/validation loaders are built here.
    dataloaders = {
        'train': DataLoader(
            tokenized_datasets['train'],
            batch_size=per_device_train_batch_size,
            shuffle=True,
        ),
    }

    # optimizer
    learning_rate = hyper_parameters['learning_rate']
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # scheduler: the optimizer is injected next to the user-supplied keyword arguments
    lr_scheduler_type = hyper_parameters['lr_scheduler_type']
    lr_scheduler_kwargs = {'optimizer': optimizer, **hyper_parameters['lr_scheduler_kwargs']}
    scheduler = get_scheduler(lr_scheduler_type, **lr_scheduler_kwargs)

    # number of epochs
    num_train_epochs = hyper_parameters['num_train_epochs']

    # Run directory named after the tuned hyper-parameters. exist_ok=True
    # creates missing parents and tolerates an already existing directory
    # without a separate existence check.
    run_name = '_'.join([f"{k}={hyper_parameters[k]}" for k in tuned_parameters])
    sub_dir = os.path.join(hyper_parameters['output_dir'], run_name)
    os.makedirs(sub_dir, exist_ok=True)
    # Record the configuration before training starts.
    record_config(sub_dir, hyper_parameters)

    # train the model on the hyper parameters
    model, records = train_epochs(model, dataloaders, num_train_epochs, device, optimizer, scheduler)
    # save training metrics, model and tokenizer
    record_metrics(sub_dir, records)
    model.save_pretrained(sub_dir)
    tokenizer.save_pretrained(sub_dir)

In [16]:
# calculate the keyword args for the scheduler: I don't want to tune these, so set them as recommended.
import math

train_sample_size = len(tokenized_datasets['train'])
num_train_epochs = 5
per_device_train_batch_size = 32
# The training DataLoader keeps the final partial batch, so one epoch takes
# ceil(N / batch_size) optimizer steps. Using the fractional quotient
# undercounts the total and lets the schedule end before training does.
steps_per_epoch = math.ceil(train_sample_size / per_device_train_batch_size)
num_training_steps = steps_per_epoch * num_train_epochs
num_warmup_steps = int(0.1 * num_training_steps)


# I will keep the names of the hyper parameters consistent with TrainingArguments so that later I can switch to it.
hyper_parameters = {
    'learning_rate': 5e-05,
    'per_device_train_batch_size': per_device_train_batch_size,
    'lr_scheduler_type': 'linear',
    'output_dir' : 'deberta_output/',
    'num_train_epochs': num_train_epochs,
    'lr_scheduler_kwargs': {'num_warmup_steps':num_warmup_steps,
                            'num_training_steps':num_training_steps},
    'device': "cuda" if torch.cuda.is_available() else "cpu"
}

# These keys name the run directory created by train_model.
tuned_parameters = ['learning_rate','per_device_train_batch_size', 'lr_scheduler_type']


train_model(model, tokenized_datasets, hyper_parameters, tuned_parameters)

{'epoch': 0, 'batch': 0, 'loss': 0.7226404547691345, 'accuracy': 0.375}
{'epoch': 0, 'batch': 1, 'loss': 0.7395396828651428, 'accuracy': 0.28125}
{'epoch': 1, 'batch': 0, 'loss': 0.7397270798683167, 'accuracy': 0.28125}
{'epoch': 2, 'batch': 0, 'loss': 0.7262541651725769, 'accuracy': 0.34375}
{'epoch': 3, 'batch': 0, 'loss': 0.7126044631004333, 'accuracy': 0.40625}
{'epoch': 4, 'batch': 0, 'loss': 0.7263943552970886, 'accuracy': 0.3125}
