In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# MRPC configuration of the GLUE dataset.
raw_datasets = load_dataset(path="glue", name="mrpc")

# One checkpoint name drives the tokenizer here and the model loaded later on.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the two sentence columns of ``example`` as a pair.

    Truncation is enabled; no padding is requested at this stage.

    Args:
        example: Mapping with "sentence1" and "sentence2" entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# batched=True passes tokenize_function a batch of examples per call.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Collator built from the same tokenizer that produced the encodings.
data_collator = DataCollatorWithPadding(tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████████████████████| 408/408 [00:00<00:00, 17515.80 examples/s]


TrainingArguments is the class that holds all the hyperparameters the Trainer will use for training and evaluation. The only argument you have to provide is a directory where the trained model will be saved, as well as the checkpoints along the way. For all the rest, you can leave the defaults, which should work pretty well for basic fine-tuning.

In [2]:
from transformers import TrainingArguments
# Only the output directory is set; every other hyperparameter keeps its default.
training_args = TrainingArguments(output_dir="test-trainer")

[2023-11-27 21:03:02,161] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. 

In [3]:
# Load the checkpoint with a two-label sequence-classification head
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


When a tokenizer is passed, the default data_collator used by the Trainer is a DataCollatorWithPadding as defined previously, so you can skip the line data_collator=data_collator in this call.

In [4]:
from transformers import Trainer

# Every argument is passed by keyword so each binding to a Trainer parameter is explicit.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

<code>trainer.train()</code> reports only the training loss and does not evaluate the model during training because:
* we didn’t tell the Trainer to evaluate during training by setting evaluation_strategy to either "steps" (evaluate every eval_steps) or "epoch" (evaluate at the end of each epoch)
* we didn’t provide the Trainer with a compute_metrics() function to calculate a metric during evaluation

In [6]:
# Start fine-tuning with the current `trainer`; the call's return value is the cell output.
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

Evaluation: build a <code>compute_metrics()</code> function that
* Takes an <code>EvalPrediction</code> object
  * A named tuple with a <code>predictions</code> field and a <code>label_ids</code> field
* Returns a dictionary mapping strings (metric names) to floats (metric values)

In [10]:
# Run the model over the validation split and inspect what comes back.
predictions = trainer.predict(tokenized_datasets["validation"])
# NOTE(review): this prints the whole output object, including the full logits array.
print(predictions)
print(*(field.shape for field in (predictions.predictions, predictions.label_ids)))

PredictionOutput(predictions=array([[-3.0083747 ,  3.7227745 ],
       [ 2.5148568 , -3.685114  ],
       [-1.1738895 ,  1.4548098 ],
       [-2.991008  ,  3.710904  ],
       [ 2.386998  , -3.392853  ],
       [-2.9963045 ,  3.7029953 ],
       [-2.874949  ,  3.396956  ],
       [-2.983798  ,  3.6950772 ],
       [-3.0102117 ,  3.678574  ],
       [-2.9851115 ,  3.6694648 ],
       [-2.9817533 ,  3.6917982 ],
       [ 0.88159555, -1.7612852 ],
       [ 2.3181114 , -3.3687425 ],
       [-2.9794805 ,  3.7076566 ],
       [-2.9881742 ,  3.6889591 ],
       [ 2.1448307 , -3.1415138 ],
       [-2.9962258 ,  3.7037761 ],
       [ 2.536143  , -3.567515  ],
       [-2.9954028 ,  3.7002656 ],
       [ 1.8065165 , -2.748624  ],
       [ 1.8496939 , -2.8457537 ],
       [-2.9432857 ,  3.678389  ],
       [-2.8317778 ,  3.3713129 ],
       [-2.9940157 ,  3.711947  ],
       [-2.932481  ,  3.6088867 ],
       [-2.5428913 ,  3.2481332 ],
       [-2.3524632 ,  2.8629637 ],
       [-3.0189471 ,  3.70

<code>predict()</code> returns a named tuple with three fields: predictions, label_ids and metrics
* metrics: the loss on the dataset passed, plus some time metrics

In [13]:
# Turn logits into predicted class ids: index of the largest value along the last axis

import numpy as np

preds = predictions.predictions.argmax(axis=-1)
preds[:8]

array([1, 0, 1, 1, 0, 1, 1, 1])

In [14]:
# Reference labels for the first eight examples, to compare against preds[:8].
predictions.label_ids[:8]

array([1, 0, 0, 1, 0, 1, 0, 1])

In [16]:
import evaluate

# Metric for the same GLUE/MRPC configuration as the dataset.
metric = evaluate.load("glue", "mrpc")
metric.compute(
    references=predictions.label_ids,
    predictions=preds,
)

Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 2.41MB/s]


{'accuracy': 0.8235294117647058, 'f1': 0.8823529411764706}

The exact results vary from run to run because the model head is randomly initialized.

In [17]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE MRPC metric.

    Args:
        eval_preds: Unpacked into a ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the argmax of the logits
        against the labels.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    scores, references = eval_preds
    # Predicted class id = position of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted_ids, references=references)

In [18]:
# Same output directory as before, now with evaluation at the end of each epoch.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)

# Reload the checkpoint so this run does not reuse the `model` object bound earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Fine-tune with the current `trainer`; the call's return value is the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.372311,0.835784,0.878403
2,No log,0.390433,0.855392,0.900169
3,0.401000,0.518771,0.865196,0.905009




TrainOutput(global_step=690, training_loss=0.33592849399732505, metrics={'train_runtime': 192.7801, 'train_samples_per_second': 57.081, 'train_steps_per_second': 3.579, 'total_flos': 430433242128000.0, 'train_loss': 0.33592849399732505, 'epoch': 3.0})