In [12]:
# model and output directory
# L_Model is the checkpoint name used for both the tokenizer and the model;
# output_dir is where the Trainer writes its checkpoints.
L_Model = "roberta-base"
output_dir = "model/Roberta_clinc_small"

# dataset
dataset_name = 'clinc_oos'
dataset_subset = 'small'

# device
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at model.to(device).
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [13]:
# Load the dataset, rename its "intent" column to "label", and pull out
# the train and validation splits.
from datasets import load_dataset

dataset = load_dataset(dataset_name, dataset_subset).rename_column("intent", "label")
train_data = dataset['train']
valid_data = dataset['validation']

Found cached dataset clinc_oos (/work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Build the label-name <-> class-id lookup tables from the training split.

labels = train_data.features["label"].names
# A label's position in `labels` is its class id.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

In [15]:
# preprocessing:
# convert text --> ids
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the global `tokenizer`.

    Truncation is enabled; no padding is requested here. Returns whatever
    the tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [16]:
# Initialise tokenizer
# Loads the tokenizer that belongs to the checkpoint named by L_Model. The
# resulting `tokenizer` global is read by preprocess_function and is also
# handed to the data collator and the Trainer.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(L_Model)

In [17]:
# Tokenize train and validation dataset
# Both splits go through the same batched map call, train first; the names
# are rebound to the tokenized versions.
train_data, valid_data = [
    split.map(preprocess_function, batched=True)
    for split in (train_data, valid_data)
]

Loading cached processed dataset at /work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1/cache-2e95d7b0c2fff21f.arrow
Loading cached processed dataset at /work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1/cache-ffef38dc9d3674db.arrow


In [18]:
# data collator to form a batch from list of training dataset
# preprocess_function tokenizes with truncation only (no padding argument),
# so padding to a common length is left to this collator when batches are
# assembled.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
# Evaluation metric: accuracy of the arg-max prediction against the references
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` is unpacked into two items: the raw model outputs and the
    reference labels. The predicted class is the arg-max along axis 1 of
    the outputs (assumes a 2-D array with one row per example — TODO confirm).
    Returns the result of `accuracy.compute`.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [20]:
# Define model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import set_seed

# The classification head is not part of the pretrained checkpoint and is
# randomly initialised here. Seed the RNGs *before* building the model so that
# initialisation is reproducible; the Trainer is only constructed later, so
# any seeding it does comes too late for these weights. 42 matches the
# TrainingArguments default seed.
set_seed(42)

# One output unit per intent label; the id/label maps are stored in the model
# config so predictions can be decoded back to intent names.
model = AutoModelForSequenceClassification.from_pretrained(
    L_Model, num_labels=len(labels), id2label=id2label, label2id=label2id, return_dict=True)

# load the model onto the configured device
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [21]:
# Sanity check: display the device the model ended up on after model.to(device).
model.device

device(type='cuda', index=0)

In [22]:
# define training arguments
# Evaluation and checkpointing both happen once per epoch, which
# load_best_model_at_end requires so that every evaluation has a matching
# checkpoint to restore.
training_args = TrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir = True,
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=10,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Without an explicit metric the "best" checkpoint is chosen by evaluation
    # loss, which can disagree with the accuracy reported by compute_metrics
    # (loss may keep rising from over-confidence while accuracy still
    # improves). Select on the metric this notebook actually reports.
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model (the returned TrainOutput is displayed as the cell result)
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8609,1.231912,0.84
2,0.4058,0.375617,0.926452
3,0.1061,0.303746,0.942903
4,0.048,0.323877,0.946774
5,0.026,0.321241,0.950323
6,0.0257,0.371746,0.948387
7,0.0118,0.355282,0.94871
8,0.0125,0.396875,0.945484
9,0.0037,0.392367,0.94871
10,0.0055,0.394325,0.946452


TrainOutput(global_step=15200, training_loss=0.3972613959406552, metrics={'train_runtime': 1514.2424, 'train_samples_per_second': 50.19, 'train_steps_per_second': 10.038, 'total_flos': 611751420952770.0, 'train_loss': 0.3972613959406552, 'epoch': 10.0})