In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
import torch
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
torch.cuda.is_available()

True

In [2]:
# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# !pip install datasets
# (uncomment the line above if the `datasets` package is not installed yet)
from datasets import load_dataset
# Load the IMDB dataset by name; the loaded object is displayed in the next cell.
imdb = load_dataset("imdb")

In [4]:
# Let's have a look at the data: displaying the object lists every split
# together with its features and number of rows.
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

### Pre-processing the data

In [5]:
# Work on a small, reproducibly shuffled subset of IMDB to speed up training.
# (Use imdb["train"] / imdb["test"] directly to train on the full splits.)
SHUFFLE_SEED = 42
TRAIN_SUBSET_SIZE = 3000
TEST_SUBSET_SIZE = 300

# select() takes an iterable of row indices, so the range is passed as-is
# instead of being copied into a list first.
small_train_dataset = imdb["train"].shuffle(seed=SHUFFLE_SEED).select(range(TRAIN_SUBSET_SIZE))
small_test_dataset = imdb["test"].shuffle(seed=SHUFFLE_SEED).select(range(TEST_SUBSET_SIZE))

In [6]:
# Building a BERT based tokenizer: load the pretrained tokenizer published
# with the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [7]:
def preprocess_function(examples):
    """Tokenize the "text" field of the given examples.

    Truncation is enabled so over-long inputs are cut down by the tokenizer;
    no padding is requested at this stage.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# batched=True passes many examples to preprocess_function per call.
tokenized_train = small_train_dataset.map(function=preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

To speed up training, we use a data collator, which converts the training samples to PyTorch tensors and batches them together with the correct amount of padding.

In [8]:
from transformers import DataCollatorWithPadding
# Build the padding collator around our tokenizer; it is handed to the Trainer
# below as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Training

We will discard the pretraining head of the BERT model and replace it with a classification head that is fine-tuned for sentiment analysis. This lets us transfer the knowledge from BERT to our custom model.

For training, we will be using the Trainer API, which is optimized for fine-tuning Transformers🤗 models such as DistilBERT, BERT and RoBERTa.

First, let's define BERT as our base model:

In [9]:
# Load "bert-base-uncased" with a sequence-classification head; num_labels=2
# gives the head two output classes. As the message printed below notes, the
# classifier weights are newly initialised, so the model must be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Then, let's define the metrics we will use to evaluate how good our fine-tuned model is (accuracy and F1 score):

In [10]:
from datasets import load_metric

def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Unpackable into ``(logits, labels)``. The per-class
            scores are read along the last axis of ``logits``.

    Returns:
        dict with Python floats under the keys ``"accuracy"`` and ``"f1"``.
        F1 treats label 1 as the positive class and is 0.0 when there are
        no true positives, false positives or false negatives to count.

    Both metrics are computed directly with NumPy, so nothing has to be
    loaded (or downloaded) each time evaluation runs.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    # Binary F1 = 2*TP / (2*TP + FP + FN), with class 1 as the positive class.
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))
    denominator = 2 * true_pos + false_pos + false_neg
    # Guard against division by zero when class 1 never occurs at all.
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": float(f1)}

Next, we define our training arguments and create a Trainer from all the objects constructed so far.

In [11]:
# !pip install accelerate -U
from transformers import TrainingArguments, Trainer

# Hyper-parameters of the fine-tuning run. Checkpoints go to the current
# directory and are written once per epoch.
training_args = TrainingArguments(
    output_dir="./",
    save_strategy="epoch",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Bundle the model, both tokenized datasets, the tokenizer, the padding
# collator and the metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [12]:
# Calling train() method of our trainer to fine-tune the model on our dataset
# (tokenized_train, using the settings from training_args).
trainer.train()

Step,Training Loss


TrainOutput(global_step=376, training_loss=0.2920035910099111, metrics={'train_runtime': 591.6631, 'train_samples_per_second': 10.141, 'train_steps_per_second': 0.635, 'total_flos': 1554673892799360.0, 'train_loss': 0.2920035910099111, 'epoch': 2.0})

In [13]:
# Next, let's compute the evaluation metrics to see how good our model is.
# evaluate() uses the eval_dataset given to the Trainer (tokenized_test); the
# result shown below includes the accuracy and F1 from compute_metrics.
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.35070809721946716,
 'eval_accuracy': 0.8866666666666667,
 'eval_f1': 0.8866666666666667,
 'eval_runtime': 11.4181,
 'eval_samples_per_second': 26.274,
 'eval_steps_per_second': 1.664,
 'epoch': 2.0}