In [1]:
!pip --q install transformers[sentencepiece]
!pip --q install datasets

In [2]:
#@ IMPORTING THE REQUIRED LIBRAREIS AND DEPENDCIENCES
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

#@ LOADING THE MODEL CHECKPOINT FOR PRETRAINED MODEL AND TOKENIZER
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

#@ CREATING A LIST OF SEQEUNCES
sequences = [
    "I've been waiting for a HuggingFace course my whole life",
    "This course is amazing"
]

batch = tokenizer(sequences,                                    # Using the tokenizer to encode the sequences
                  padding=True,
                  truncation=True,
                  return_tensors="pt")

batch["labels"] = torch.tensor([1,1])                           # Setting up the labels for sequences
optimizer = torch.optim.AdamW(model.parameters())               # Creating an optimizer for the model
loss = model(**batch).loss                                      # Calculating the loss
loss.backward()                                                 # Backpropagate the loss through the model
optimizer.step()                                                # Update the model parameters

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Getting the dataset**

In [3]:
#@ LOADING THE DATASET FROM HUGGING FACE DATASETS LIBRARY
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
raw_datasets["train"][5]

{'sentence1': 'Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .',
 'sentence2': "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
 'label': 1,
 'idx': 5}

In [5]:
#@ ACCESSING EACH PAIR OF SENTENCES
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [6]:
#@ CHECKING LABELS CORRESPONDS
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## Preprocessing a dataset

- To preprocess the dataset, we need to convert the text to numbers the model can make sense of for this we use `tokenizer`

- We can feed tokenizer one sentence or list of sentence, so we can directly tokenize all the first sentence and all the second sentences of each pair

In [7]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences2 = tokenizer(raw_datasets["train"]["sentence2"])

In [8]:
#@ PREPROCESSING THE TRAINING DATASET
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [9]:
#@ IMPLEMENTING TOKENIZER:
inputs = tokenizer("This is a first sentence",
                   "This is a second sentence")                              # Tokenization.
inputs

{'input_ids': [101, 2023, 2003, 1037, 2034, 6251, 102, 2023, 2003, 1037, 2117, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
#@ DEFINING THE TOKENIZATION FUNCTION
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

#@ IMPLEMENTATION OF TOKENIZATION FUNCTION
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

- It takes dictionary (the items of our dataset) and returns a new dictionary with the keys *input_ids*, *attention_mask*, *token_type_ids*

## Dynamic Padding

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
#@ IMPLEMENTING COLLATOR FUNCTION
samples = tokenized_datasets["train"][:8]           # create a list of 8 samples from train dataset
print(samples)

# remove idx, sentence1, sentence2 keys from samples
samples = {k:v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print(samples)

# print length of each input id in sample dictionary
[len(x) for x in samples["input_ids"]]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .', 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .', 'Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .', 'The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .', 'The DVD-CCA then appealed to the state Supreme Court .'], 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold 

[50, 59, 47, 67, 59, 50, 62, 32]

In [13]:
#@ IMPLEMENTATION OF COLLATOR FUNCTION
batch = data_collator(samples)                # create a batch data from sample object
{k:v.shape for k, v in batch.items()}         # print shape of each key in the batch

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

- Transformers provides `Trainer` class to help for fine tuning any of the pretrained models
- After completion of data preprocessing, we just need to dine `Trainer`

## **Training:**

- The first step is to define our `Trainer` is to define `TrainerArguments` to provide a directory where the trained model will be saved, as well as checkpoint.

In [14]:
#@ THE TRAINER API
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")

In [15]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  weight_decay=0.01)

training_args = TrainingArguments(checkpoint)

In [17]:
#@ DEFINING THE TRAINER
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()



Step,Training Loss
500,0.5784
1000,0.3827


TrainOutput(global_step=1377, training_loss=0.4180120608039585, metrics={'train_runtime': 193.2389, 'train_samples_per_second': 56.945, 'train_steps_per_second': 7.126, 'total_flos': 405324636337200.0, 'train_loss': 0.4180120608039585, 'epoch': 3.0})

In [18]:
#@ EVALUATING THE MODEL
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)


In [19]:
#@ TRANSFORMING LOGITS INTO PREDICTIONS
import numpy as np
preds = np.argmax(predictions.predictions, axis=1)

In [20]:
!pip --q install evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/81.4 kB[0m [31m440.3 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.4 kB[0m [31m631.1 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m657.9 kB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8259803921568627, 'f1': 0.8802698145025295}

In [22]:
#@ WRAPPING UP EVERYTHING TOGETHER - COMPUTING METRICS
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits,labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [25]:
#@ DEFINING THE NEW TRAINER:
training_args = TrainingArguments("test-trainer",
                                  evaluation_strategy="epoch")
trainer = Trainer(model,
                  training_args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["validation"],
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.765806,0.791667,0.860427
2,0.264900,0.936632,0.821078,0.8832
3,0.161500,0.88382,0.830882,0.877876


TrainOutput(global_step=1377, training_loss=0.18514188546240978, metrics={'train_runtime': 221.3381, 'train_samples_per_second': 49.716, 'train_steps_per_second': 6.221, 'total_flos': 405540469624800.0, 'train_loss': 0.18514188546240978, 'epoch': 3.0})