In [None]:
# Write code to fine-tune BERT to the IMDB dataset for sentiment analysis.
# Divide the training dataset into training/validation (90%/10%)
# Fine-tune for 10 epochs measuring the validation performance after each epoch. Visualize the performance in a plot.
# How many epochs takes to reach the best validation performance?
# Evaluate on the test dataset. How validation and test performances differ?

# install datasets
!pip install datasets

In [3]:
#sentiment analysis fine tuning
from datasets import list_datasets, load_dataset

from pprint import pprint

In [5]:
# Currently available datasets
ds = load_dataset("stanfordnlp/imdb")
ds_train = load_dataset("stanfordnlp/imdb", split='train[:90%]')
ds_validation = load_dataset("stanfordnlp/imdb", split='train[-10%:]')
ds_test = load_dataset("stanfordnlp/imdb", split='test')
# print(ds_train)
# print(ds_validation)
# print(ds_test)


In [6]:
# But first see BERT tokenizer exmaples and other required stuff!
from transformers import AutoTokenizer, DataCollatorWithPadding
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(ds_validation)
example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'
tokens=tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(tokens)
# print(token_ids)

Dataset({
    features: ['text', 'label'],
    num_rows: 2500
})


In [7]:
from os import truncate
#To do all preprocessing you specify some parameters in the encod_plus() method of the tokenizer
encoding = tokenizer.encode_plus(
  example,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
  truncation=True
)

encoding.keys()
#The tokens ids list
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]
#The attentions masked tokens
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']
#let's see how the sentence is tokenized with bert tokenizer
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])


18
18


['[CLS]',
 'in',
 'this',
 'ka',
 '##ggle',
 'notebook',
 ',',
 'i',
 'will',
 'do',
 'sentiment',
 'analysis',
 'using',
 'bert',
 'with',
 'hugging',
 '##face',
 '[SEP]']

In [None]:
#In order to get the suitable sequence length we need to take a look at all reviews length and then select the most appropriate one
import seaborn as sns
import matplotlib.pyplot as plt
token_lens = []

for txt in ds['train']['text']:
  tokens = tokenizer.encode(txt, max_length=512)
  token_lens.append(len(tokens))
sns.histplot(token_lens)
plt.xlim([0, 500]);
plt.xlabel('Token count')

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_ds_train = ds_train.map(tokenize_function, batched=True)
tokenized_ds_validation = ds_validation.map(tokenize_function, batched=True)
tokenized_ds_test = ds_test.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [9]:
from datasets import ClassLabel, Value
def process_tokenized_datasets(tokenized_ds):
  tokenized_ds = tokenized_ds.remove_columns(["text"])
  tokenized_ds = tokenized_ds.rename_column("label", "labels")
  new_features = tokenized_ds.features.copy()
  new_features["labels"] = Value("float")
  tokenized_ds = tokenized_ds.cast(new_features)
  tokenized_ds.set_format("torch")
  tokenized_ds.column_names
  return tokenized_ds

tokenized_ds_train_processed = process_tokenized_datasets(tokenized_ds_train)
tokenized_ds_validation_processed = process_tokenized_datasets(tokenized_ds_validation)
tokenized_ds_test_processed = process_tokenized_datasets(tokenized_ds_test)
print(tokenized_ds_train_processed['labels'])
print(tokenized_ds_validation_processed)
print(tokenized_ds_test_processed)

Casting the dataset:   0%|          | 0/2500 [00:00<?, ? examples/s]

tensor([0., 0., 0.,  ..., 1., 1., 1.])
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2500
})
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})


In [10]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_ds_train_processed, batch_size=8, collate_fn=data_collator
)
validation_dataloader = DataLoader(
    tokenized_ds_validation_processed, batch_size=8, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_ds_test_processed, batch_size=8, collate_fn=data_collator
)
print(validation_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7c1707d44790>


In [11]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 495]),
 'token_type_ids': torch.Size([8, 495]),
 'attention_mask': torch.Size([8, 495])}

In [14]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)
print(outputs)

tensor(0.1614, grad_fn=<MseLossBackward0>) torch.Size([8, 1])
SequenceClassifierOutput(loss=tensor(0.1614, grad_fn=<MseLossBackward0>), logits=tensor([[0.4713],
        [0.3351],
        [0.3344],
        [0.4614],
        [0.3716],
        [0.5595],
        [0.2892],
        [0.3122]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [16]:
from transformers import AdamW
import torch
#change the optimizer to PyTorch AdamW implementation

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [17]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(validation_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

939


In [18]:
#The training loop

import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/8439 [00:00<?, ?it/s]