In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Download (or reuse the cached copy of) the MRPC configuration of GLUE.
raw_datasets = load_dataset(path="glue", name="mrpc")

# A single checkpoint name is shared by the tokenizer here and by the model
# loaded in a later cell, so the two always stay in sync.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the two sentences of an MRPC record as a single paired input.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            It is passed to ``map(..., batched=True)`` in this notebook, so
            each entry holds a batch of sentences rather than one string.

    Returns:
        The tokenizer's encoding of the sentence pair(s). Truncation is
        enabled; no padding is applied at this stage.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Tokenize every split in batches. Padding is intentionally left to the
# collator, which is handed the same tokenizer.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|████████████████████████| 3668/3668 [00:00<00:00, 14670.22 examples/s]
Map: 100%|██████████████████████████| 408/408 [00:00<00:00, 18665.55 examples/s]
Map: 100%|████████████████████████| 1725/1725 [00:00<00:00, 24191.68 examples/s]


* Create a **TrainingArguments** instance
  * it holds the hyperparameters, output directories, etc.

In [2]:
from transformers import TrainingArguments

# Only the output directory is specified; every other setting keeps its default.
training_args = TrainingArguments(output_dir="test-trainer")

[2023-12-17 14:10:26,238] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
from transformers import AutoModelForSequenceClassification, set_seed

# The two-label classification head is not part of the pretrained checkpoint
# and is initialized randomly. Seed the RNGs first, using the same seed the
# training arguments carry, so the starting weights are repeatable after a
# kernel restart.
set_seed(training_args.seed)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* A new classifier head is added, randomly initialized

In [4]:
from transformers import Trainer

# Bundle the model, the training configuration, the tokenized train/validation
# splits, the padding collator and the tokenizer into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [5]:
# Run fine-tuning with the Trainer configured in the previous cell. The call
# is the cell's last expression, so its TrainOutput summary is displayed.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msoonchangpoh[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.3775




TrainOutput(global_step=690, training_loss=0.3076247173806895, metrics={'train_runtime': 175.3183, 'train_samples_per_second': 62.766, 'train_steps_per_second': 3.936, 'total_flos': 430291408824720.0, 'train_loss': 0.3076247173806895, 'epoch': 3.0})

**compute_metrics()**
* Input: `EvalPrediction` (a tuple of **predictions** and **label_ids**)
* Return: dictionary mapping each metric name (key) to its float value

In [6]:
# Run inference on the validation split, then show the shape of the model
# outputs and of the reference labels, one per line.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape, sep="\n")

(408, 2)
(408,)


In [7]:
# Show the raw PredictionOutput returned by trainer.predict.
# NOTE(review): this echoes the whole array; display a slice such as
# predictions.predictions[:5] if a shorter output is preferred.
predictions

PredictionOutput(predictions=array([[-2.8333833e+00,  2.8015301e+00],
       [ 2.9464595e+00, -3.0162218e+00],
       [ 1.9412735e+00, -1.6329376e+00],
       [-2.7756567e+00,  2.8011887e+00],
       [ 2.6230412e+00, -2.8055775e+00],
       [-2.7766528e+00,  2.7675545e+00],
       [-2.5183961e+00,  2.2145400e+00],
       [-2.7911282e+00,  2.7915554e+00],
       [-2.7402694e+00,  2.7176280e+00],
       [-2.7931893e+00,  2.7988048e+00],
       [-2.7837543e+00,  2.7844136e+00],
       [ 2.8804936e+00, -2.8534677e+00],
       [ 2.3489616e+00, -2.0654774e+00],
       [-2.7230930e+00,  2.7047725e+00],
       [-2.8122118e+00,  2.7996569e+00],
       [ 1.9923896e+00, -2.3156464e+00],
       [-2.7836628e+00,  2.7994080e+00],
       [ 1.6739997e+00, -1.3882061e+00],
       [-2.7840524e+00,  2.7974110e+00],
       [ 2.3737619e+00, -2.2511227e+00],
       [ 2.2953274e+00, -2.5455713e+00],
       [-2.7319446e+00,  2.6170747e+00],
       [ 2.0457239e+00, -2.2287965e+00],
       [-2.7596602e+00,  2.7

In [8]:
# Turn the per-class scores into one predicted class index per example by
# taking the position of the largest score along the last axis.

import numpy as np

preds = predictions.predictions.argmax(axis=-1)

In [9]:
!pip install evaluate
!pip install sklearn scipy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting huggingface-hub>=0.7.0 (from evaluate)
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Using cached huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.16.4
    Uninstalling huggingface-hub-0.16.4:
      Successfully uninstalled huggingface-hub-0.16.4
Successfully installed huggingface-hub-0.19.4
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKE

In [10]:
import evaluate
# Load the metric bundled with the GLUE/MRPC task and score the argmax
# predictions against the reference labels returned by trainer.predict.
metric = evaluate.load("glue", "mrpc")
metric.compute(references=predictions.label_ids, predictions=preds)

{'accuracy': 0.8553921568627451, 'f1': 0.899488926746167}

In [21]:
# Load the GLUE/MRPC metric once, next to the function that uses it.
# compute_metrics runs at every evaluation pass, so loading inside the
# function repeated the same lookup each time.
mrpc_metric = evaluate.load("glue", "mrpc")


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: Pair that unpacks as ``(logits, labels)``: the model's
            per-class scores and the reference label ids.

    Returns:
        The dictionary produced by the metric's ``compute`` call (for this
        metric the notebook shows ``accuracy`` and ``f1`` entries).
    """
    logits, labels = eval_preds
    # Named differently from the notebook-level `predictions` object to
    # avoid shadowing it inside the function.
    predicted_labels = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predicted_labels, references=labels)

In [22]:
from transformers import set_seed

# Same output directory as before, but now evaluate at the end of every epoch.
# NOTE(review): newer transformers releases spell this argument
# `eval_strategy`; keep the name that matches the installed version.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

# Start again from a fresh model. Its classification head is randomly
# initialized, so seed the RNGs first to make the starting weights repeatable.
set_seed(training_args.seed)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Rebuild the Trainer around the fresh model, this time passing
# compute_metrics so each evaluation pass also reports the task metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
# Fine-tune again. With per-epoch evaluation and compute_metrics configured,
# the progress table also reports validation loss, accuracy and F1 per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.358635,0.852941,0.894366
2,No log,0.489262,0.830882,0.8867
3,0.375100,0.585484,0.85049,0.895726




TrainOutput(global_step=690, training_loss=0.3027944703033005, metrics={'train_runtime': 182.0044, 'train_samples_per_second': 60.46, 'train_steps_per_second': 3.791, 'total_flos': 430433242128000.0, 'train_loss': 0.3027944703033005, 'epoch': 3.0})