In [1]:
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

In [3]:
raw_datasets = load_dataset("glue", "mrpc")

In [4]:
checkpoint = "bert-base-uncased"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [6]:
def tokenizer_func(data):
    """
    Since this is the next sentance prediction fine-tuning, we combine sentence 1 and sentence 2
    """
    return tokenizer(data["sentence1"], data["sentence2"], truncation=True)

In [7]:
tokenized_datasets = raw_datasets.map(tokenizer_func, batched=True)

Map: 100%|██████████| 408/408 [00:00<00:00, 20889.86 examples/s]


In [29]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [8]:
# next we use the DataCollator for dynamic padding across batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
### Training a transformer model
"""
The first step is to define the Trainer and TrainingArguments that will contain all the hyperparameters. The trainer will be used for
training and evaluation. 
"""

'\nThe first step is to define the Trainer and TrainingArguments that will contain all the hyperparameters. The trainer will be used for\ntraining and evaluation. \n'

In [10]:
from transformers import TrainingArguments # note: trainer requires accelerator package

In [11]:
!pip install "transformers[torch]"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [12]:
training_arguments = TrainingArguments("test-trainer")

In [None]:
# Note: to save publish ur model to hub use , set
# training_arguments.push_to_hub = True

In [13]:
# defining the model

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# The above warning means that the base model was pre-trained for sentance classification, and since we are using for next sentance prediction, bert automatically
# gives removed the head with a random head and random weights, and suggests training the model with new data

from transformers import Trainer

trainer = Trainer(
    model, 
    training_arguments,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)


* 'schema_extra' has been renamed to 'json_schema_extra'


In [15]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 36%|███▋      | 500/1377 [01:44<02:55,  4.99it/s]

{'loss': 0.6397, 'grad_norm': 2.1331958770751953, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


 73%|███████▎  | 1000/1377 [03:23<01:15,  4.99it/s]

{'loss': 0.6052, 'grad_norm': 10.576410293579102, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


100%|██████████| 1377/1377 [04:43<00:00,  4.85it/s]

{'train_runtime': 283.7954, 'train_samples_per_second': 38.774, 'train_steps_per_second': 4.852, 'train_loss': 0.5812389343306044, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.5812389343306044, metrics={'train_runtime': 283.7954, 'train_samples_per_second': 38.774, 'train_steps_per_second': 4.852, 'total_flos': 405114969714960.0, 'train_loss': 0.5812389343306044, 'epoch': 3.0})

In [16]:
"""
We didnt tell the trainer how to evaluate the training with an evaluation strategy to either "steps" or "epochs".
We also didnt provide the trainer with compute metrics, which will provide more insight into the trainig process
"""

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 51/51 [00:03<00:00, 15.75it/s]

(408, 2) (408,)





In [18]:
predictions.label_ids

array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,

In [17]:
predictions.metrics

{'test_loss': 0.44859522581100464,
 'test_runtime': 5.7796,
 'test_samples_per_second': 70.593,
 'test_steps_per_second': 8.824}

In [19]:
# before converting logits to prediction, checking how does each row look like, these are logits and we can use argmax or even softmax
predictions.predictions[0]

array([-1.251398 ,  2.1139903], dtype=float32)

In [20]:
# Note: All transformer models return logits, to transform them into true prediction that we can compare to our labels
# We need the index with the maximum value on the second axis.

import numpy as np

preds = np.argmax(predictions.predictions, axis=-1) # note: since the shape of the

In [21]:
# evaluating the model
import evaluate

metrics = evaluate.load("glue", "mrpc") # these are the benchmarks
metrics.compute(predictions=preds, references=predictions.label_ids)

Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 19.0MB/s]


{'accuracy': 0.803921568627451, 'f1': 0.8639455782312925}

In [26]:
# building a wrapper for the compute metrics
def compute_metrics(eval_preds):
    metrics = evaluate.load("glue","mrpc")
    logits, labels = eval_preds # looks like eval_preds = [predictions, labels_ids] 
    preds = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=preds, references=labels)

In [None]:
# defining a trainer with eval strategy, and eval computation metrics.

In [24]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

In [27]:
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [28]:
trainer.train() # this should report compute metrics at each epoch (evaluation strategy)

                                                  
 33%|███▎      | 459/1377 [01:35<03:31,  4.35it/s]

{'eval_loss': 0.4523482322692871, 'eval_accuracy': 0.8112745098039216, 'eval_f1': 0.8756058158319872, 'eval_runtime': 2.5512, 'eval_samples_per_second': 159.925, 'eval_steps_per_second': 19.991, 'epoch': 1.0}


 36%|███▋      | 500/1377 [01:43<02:56,  4.98it/s]

{'loss': 0.432, 'grad_norm': 0.2898280918598175, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


                                                  
 67%|██████▋   | 918/1377 [03:09<01:48,  4.23it/s]

{'eval_loss': 0.6826484799385071, 'eval_accuracy': 0.8308823529411765, 'eval_f1': 0.882051282051282, 'eval_runtime': 2.4221, 'eval_samples_per_second': 168.446, 'eval_steps_per_second': 21.056, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [03:25<01:10,  5.33it/s]

{'loss': 0.2168, 'grad_norm': 0.0269420575350523, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


                                                   
100%|██████████| 1377/1377 [04:42<00:00,  4.88it/s]

{'eval_loss': 1.0774508714675903, 'eval_accuracy': 0.8161764705882353, 'eval_f1': 0.878048780487805, 'eval_runtime': 2.3821, 'eval_samples_per_second': 171.276, 'eval_steps_per_second': 21.409, 'epoch': 3.0}
{'train_runtime': 282.2585, 'train_samples_per_second': 38.986, 'train_steps_per_second': 4.879, 'train_loss': 0.26291740711131817, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.26291740711131817, metrics={'train_runtime': 282.2585, 'train_samples_per_second': 38.986, 'train_steps_per_second': 4.879, 'total_flos': 405114969714960.0, 'train_loss': 0.26291740711131817, 'epoch': 3.0})