# Notebook de referência

Nome:

## Instruções:


Treinar e medir a acurácia de um modelo BERT (ou variantes) para classificação binária usando o dataset do IMDB (20k/5k amostras de treino/validação).

Importante:
- Deve-se implementar o próprio laço de treinamento.
- Implementar o acúmulo de gradiente.

Dicas:
- O BERT costuma aprender bem uma tarefa em poucas épocas (de 3 a 5). Se estiver demorando mais de 5 épocas para chegar a 80% de acurácia, ajuste os hiperparâmetros.

- Solução para erro de memória:
  - Usar bfloat16 permite quase dobrar o batch size

Opcional:
- Pode-se usar a função trainer da biblioteca Transformers/HuggingFace para verificar se seu laço de treinamento está correto. Note que ainda assim é obrigatório implementar o laço próprio.

# Fixando a seed

In [5]:
import random
import torch
import torch.nn.functional as F
import numpy as np
import os

In [6]:
# Seed Python, NumPy and PyTorch with one shared value so runs are repeatable.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1c526a5ebf0>

## Preparando Dados

Primeiro, fazemos download do dataset:

In [7]:
# Download and unpack the IMDB archive only when the tarball is not already in
# the working directory. The `!` lines are IPython shell escapes, so this cell
# only runs inside a notebook/IPython session.
if not os.path.isfile("aclImdb.tgz"):
    # curl: -L follows redirects, -O saves under the remote file name.
    !curl -LO http://files.fast.ai/data/aclImdb.tgz
    # tar: extract (-x) the gzip-compressed (-z) archive file (-f) in place.
    !tar -xzf aclImdb.tgz

## Carregando o dataset

Criaremos uma divisão de treino (20k exemplos) e validação (5k exemplos) artificialmente.

In [8]:
import os

# Number of shuffled training examples held out as the validation split.
max_valid = 5000

def load_texts(folder):
    """Read every file in ``folder`` as UTF-8 text.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary, filesystem-dependent order, which would make any seeded
    shuffle/split performed on the result differ between machines.

    Args:
        folder: Path of a directory containing one text per file.

    Returns:
        list[str]: File contents, ordered by file name. Empty if the
        directory has no entries.
    """
    texts = []
    for path in sorted(os.listdir(folder)):
        with open(os.path.join(folder, path), encoding="utf-8") as f:
            texts.append(f.read())
    return texts

# Load the raw reviews of each split/class (one text per file).
x_train_pos = load_texts('./aclImdb/train/pos')
x_train_neg = load_texts('./aclImdb/train/neg')
x_test_pos = load_texts('./aclImdb/test/pos')
x_test_neg = load_texts('./aclImdb/test/neg')

# Positive reviews are labelled True and negative ones False, in the same
# order in which the texts are concatenated.
x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg
y_train = [True] * len(x_train_pos) + [False] * len(x_train_neg)
y_test = [True] * len(x_test_pos) + [False] * len(x_test_neg)

# Shuffle texts and labels together before carving out the validation split.
c = list(zip(x_train, y_train))
random.shuffle(c)
x_train, y_train = zip(*c)

# The last `max_valid` shuffled examples become the validation set; the rest
# stays as training data.
x_valid = x_train[-max_valid:]
y_valid = y_train[-max_valid:]
x_train = x_train[:-max_valid]
y_train = y_train[:-max_valid]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

print('3 primeiras amostras treino:')
for x, y in zip(x_train[:3], y_train[:3]):
    print(y, x[:100])

print('3 últimas amostras treino:')
for x, y in zip(x_train[-3:], y_train[-3:]):
    print(y, x[:100])

print('3 primeiras amostras validação:')
# Validation texts must be paired with validation labels (y_valid), not with
# the test labels, otherwise the preview shows labels of unrelated reviews.
for x, y in zip(x_valid[:3], y_valid[:3]):
    print(y, x[:100])

print('3 últimas amostras validação:')
for x, y in zip(x_valid[-3:], y_valid[-3:]):
    print(y, x[:100])

20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
False POSSIBLE SPOILERS<br /><br />The Spy Who Shagged Me is a muchly overrated and over-hyped sequel. Int
False The long list of "big" names in this flick (including the ubiquitous John Mills) didn't bowl me over
True Bette Midler showcases her talents and beauty in "Diva Las Vegas". I am thrilled that I taped it and
3 últimas amostras treino:
False I was previously unaware that in the early 1990's Devry University (or was it ITT Tech?) added Film 
True The story and music (George Gershwin!) are wonderful, as are Levant, Guetary, Foch, and, of course, 
True This is my favorite show. I think it is utterly brilliant. Thanks to David Chase for bringing this i
3 primeiras amostras validação:
True Why has this not been released? I kind of thought it must be a bit rubbish since it hasn't been. How
True I was amazingly impressed by this movie. It contained fundamental elements of

# Tokenizer

In [9]:
# The tokenizer could alternatively be loaded through torch.hub
# ('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased').
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize a single review to inspect the boundary token ids and the length.
tokens = tokenizer(x_train[0], add_special_tokens=True, padding=True, truncation=True)
sample_ids = tokens["input_ids"]

# First and last ids of the tokenized sample.
CLS = int(sample_ids[0])
SEP = int(sample_ids[-1])
print(f"Start of sequence token: {CLS}, End of sequence token: {SEP}")
len(sample_ids)

Start of sequence token: 101, End of sequence token: 102


512

# Dataset / Dataloader

In [10]:
from torch.utils.data import Dataset, DataLoader

class IMDB(Dataset):
    """Pre-tokenized text-classification dataset with fixed-length inputs.

    All texts are tokenized once at construction time and padded/truncated to
    exactly ``max_length`` tokens. Padding to an explicit length (instead of
    "pad to the longest text") guarantees the item shape even when no text in
    ``X`` reaches ``max_length`` tokens, e.g. on a small debugging subset.

    Args:
        X: Sequence of raw texts.
        Y: Sequence of labels, one per text; stored as a float32 tensor.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention (keyword arguments ``add_special_tokens``, ``padding``,
            ``truncation``, ``max_length``, ``return_tensors``) returning a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Number of tokens per example. Defaults to 512.

    Raises:
        ValueError: If the tokenizer output is not ``max_length`` wide.
    """

    def __init__(self, X, Y, tokenizer, max_length=512):
        super().__init__()
        self.max_length = max_length

        # Tokenize the whole corpus in one call; list() accepts tuples too.
        self.tokenized_data = tokenizer(
            list(X),
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        # Input ids and labels.
        self.X = self.tokenized_data["input_ids"]
        self.Y = torch.tensor(Y, dtype=torch.float32)

        # Attention mask of the inputs.
        self.mask = self.tokenized_data["attention_mask"]

        # Validate the shape once here instead of on every item access.
        if self.X.shape[-1] != max_length:
            raise ValueError("Bad Input")
        if self.mask.shape[-1] != max_length:
            raise ValueError("Bad Mask")

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for example ``idx``."""
        return self.X[idx], self.mask[idx], self.Y[idx]

# Tokenize the three splits with the shared tokenizer, in train/valid/test order.
train_dataset, valid_dataset, test_dataset = (
    IMDB(texts, labels, tokenizer)
    for texts, labels in (
        (x_train, y_train),
        (x_valid, y_valid),
        (x_test, y_test),
    )
)

In [11]:
# Mini-batch size used by every loader.
batch_size = 32

# Only the training data is reshuffled each epoch. The evaluation loaders keep
# a fixed order: shuffling cannot change the aggregated loss/accuracy and only
# makes evaluation runs harder to reproduce.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Modelo

In [12]:
from transformers import DistilBertModel

class BinaryClassifier(torch.nn.Module):
    """DistilBERT encoder followed by dropout and a single-logit linear head.

    Args:
        dropout: Dropout probability applied to the pooled representation
            before the linear head. Defaults to 0 (no dropout).
        model_name: Name or path of the pretrained DistilBERT checkpoint to
            load. Defaults to ``'distilbert-base-cased'``.
    """

    def __init__(self, dropout=0, model_name='distilbert-base-cased'):
        super().__init__()

        self.bert_model = DistilBertModel.from_pretrained(model_name)

        # Size the head from the encoder configuration instead of hard-coding
        # 768, so checkpoints with a different hidden width also work.
        hidden_size = self.bert_model.config.hidden_size

        self.dropout = torch.nn.Dropout(dropout)
        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, X, mask=None):
        """Compute one raw logit per example.

        Args:
            X: Token ids passed to the encoder as ``input_ids``.
            mask: Optional attention mask passed as ``attention_mask``.

        Returns:
            Tensor with one unnormalised logit per example (last dimension 1);
            no sigmoid is applied here.
        """
        output = self.bert_model(input_ids=X, attention_mask=mask)
        # Use the hidden state at the first sequence position as the summary
        # of the whole input.
        C = output.last_hidden_state[:, 0]

        y = self.dropout(C)
        y = self.linear(y)

        return y


model = BinaryClassifier()

# Smoke test: push one training batch through the model to check the shapes.
sample_X, sample_mask, sample_label = next(iter(train_dataloader))
print(f"input_ids shape {sample_X.shape}, attention mask shape {sample_mask.shape}")
# A shape check needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    sample_output = model(sample_X, sample_mask)
print(f"sample_output shape {sample_output.shape}, sample_label shape {sample_label.shape}")

input_ids shape torch.Size([32, 512]), attention mask shape torch.Size([32, 512])
sample_output shape torch.Size([32, 1]), sample_label shape torch.Size([32])


# Treino

## Funções Auxiliares do Elton

Peguei as funções auxiliares do Elton pois achei bem bonita a maneira como ele mostra as informações.
Ele também programa bem melhor que eu: se comparar nossas maneiras de computar a loss, o código dele está bem mais organizado e documentado.

In [15]:
# ------------------------ Start of Elton's code ------------------------ #
from typing import Tuple, Optional
import tqdm # The use of TQDM makes training way more user friendly.

def ppl(loss:torch.Tensor) -> torch.Tensor:
    """
    Converts a loss value into its perplexity, i.e. exp(loss).

    Args:
        loss (torch.Tensor): loss tensor to exponentiate.

    Returns:
        torch.Tensor: element-wise exponential of the loss.
    """
    perplexity = loss.exp()
    return perplexity

def print_info(loss_value:torch.Tensor, epoch:int, total_epochs:int, 
               time:float=0.0, accuracy:Optional[float]=None):
    """
    Prints the information of an epoch on a single line.

    The fields are joined with ", " and emitted with one print call, so the
    optional accuracy and elapsed-time fields never break the line and no
    indentation from the source code leaks into the message.

    Args:
        loss_value (torch.Tensor): epoch loss (single-element tensor).
        epoch (int): zero-based epoch number; printed as epoch + 1.
        total_epochs (int): total number of epochs. 
        time (float, optional): time to run the epoch, in seconds. Not printed if 0.0. Defaults to 0.0.
        accuracy (float, optional): epoch accuracy as a fraction; printed as a percentage.
                                    Not printed if None.
    """
    ppl_value = ppl(loss_value)

    fields = [
        f'Epoch [{epoch+1}/{total_epochs}]',
        f'Loss: {loss_value.item():.4f}',
        f'Perplexity: {ppl_value.item():.4f}',
    ]

    if accuracy is not None:
        fields.append(f'Accuracy: {100*accuracy:.4f}%')

    if time != 0:
        fields.append(f'Elapsed Time: {time:.2f} sec')

    print(', '.join(fields))

# Mode flags accepted by compute_loss: training pass vs. evaluation pass.
MODE_TRAIN, MODE_EVALUATE = 0, 1

def compute_loss(model:torch.nn.Module, loader:DataLoader, 
                 criterion:torch.nn.Module, mode:int = MODE_EVALUATE, 
                 optimizer:Optional[torch.optim.Optimizer]=None, 
                 accumulation_steps:Optional[int] = 1) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Computes the loss and accuracy of a binary classifier across a dataset.

    If in train mode also runs optimizer steps, using gradient accumulation.

    Args:
        model (torch.nn.Module): model to evaluate; called as model(inputs, masks)
                                 and expected to return one logit per example.
        loader (DataLoader): dataset, yielding (inputs, masks, targets) batches.
        criterion (torch.nn.Module): loss function, called on (logits, targets).
        mode (int): mode of the computation. 
                    If MODE_EVALUATE, computes without gradient and in eval mode.
                    If MODE_TRAIN, computes with gradient and in train mode.
                    Default is MODE_EVALUATE.
        optimizer (torch.optim.Optimizer, optional): optimizer to use in the train mode.
                    Required when mode is MODE_TRAIN.
        accumulation_steps (int, optional): number of batches whose gradients are 
                    accumulated before each optimizer step. None is treated as 1. 
                    Defaults to 1.

    Returns:
        torch.Tensor: example-weighted mean loss over the dataset (no gradient history).
        torch.Tensor: fraction of examples whose rounded sigmoid output equals the target.
        Both are NaN if the loader yields no examples.

    Raises:
        ValueError: if mode is unknown, or if mode is MODE_TRAIN and no optimizer is given.
    """
    device = next(iter(model.parameters())).device

    if mode not in (MODE_TRAIN, MODE_EVALUATE):
        raise ValueError(f"Unknown mode: {mode}.")
    training = mode == MODE_TRAIN

    if training and optimizer is None:
        raise ValueError("An optimizer is required when mode is MODE_TRAIN.")
    if accumulation_steps is None:
        accumulation_steps = 1

    model.train(training)
    if training:
        optimizer.zero_grad()
        # Needed to flush the last, possibly incomplete, accumulation window.
        n_batches = len(loader)

    total_loss = torch.tensor(0, dtype=torch.float32, device=device)
    correct = torch.tensor(0, dtype=torch.float32, device=device)
    n = 0

    # The context manager restores the previous grad mode even if the loop is
    # interrupted (exception, KeyboardInterrupt); toggling the global switch
    # by hand would leave gradients disabled for the rest of the session.
    with torch.set_grad_enabled(training):
        for batch_index, (inputs, masks, targets) in enumerate(tqdm.tqdm(loader)):
            inputs = inputs.to(device)
            masks = masks.to(device)
            targets = targets.reshape(-1).to(device)

            logits = model(inputs, masks)
            # Drop only the trailing logit dimension. A bare squeeze() would
            # also collapse a batch of size 1 to a 0-dim tensor, which no
            # longer matches the shape of the targets.
            logits = logits.view(-1, logits.shape[-1]).squeeze(-1)

            loss : torch.Tensor = criterion(logits, targets)
            batch_size = targets.size(0)

            # Accumulate on detached values so the running totals do not keep
            # autograd history of every batch alive during training.
            total_loss += loss.detach() * batch_size
            predicted = torch.round(torch.sigmoid(logits.detach()))
            correct += (predicted == targets).sum()

            n += batch_size

            if training:
                # Scale so the accumulated gradient averages over the window.
                (loss / accumulation_steps).backward()

                window_full = (batch_index + 1) % accumulation_steps == 0
                last_batch = batch_index + 1 == n_batches
                if window_full or last_batch:
                    optimizer.step()
                    optimizer.zero_grad()

    total_loss /= n 
    accuracy = correct / n

    return total_loss, accuracy

# ------------------------ End of Elton's code ------------------------ #
# Thanks Elton :D

## Laço de Treinamento

In [16]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU, and
# report which one was selected.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device('cpu')
    print('using CPU')

GPU: NVIDIA GeForce RTX 3060 Ti


In [20]:
import time
from time import strftime, localtime

# Hyperparameters. The optimizer steps once every `accumulation_steps`
# mini-batches (see compute_loss).
accumulation_steps = 8
lr = 2e-5
n_epoch = 3

model = BinaryClassifier()
model.to(device)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Baseline: validation metrics before any training step. epoch=-1 makes
# print_info report it as epoch 0.
first_loss, first_accuracy = compute_loss(model, valid_dataloader, criterion, MODE_EVALUATE)
print_info(loss_value=first_loss, epoch=-1, total_epochs=n_epoch, accuracy=first_accuracy)

for epoch in range(n_epoch):
    start = time.time() 

    # One training pass over the training set, timed for the report.
    # print_info derives the perplexity from the loss, so it is not computed here.
    loss_train, accuracy_train = compute_loss(model, train_dataloader, criterion, MODE_TRAIN, optimizer, accumulation_steps)
    print_info(loss_train, epoch, n_epoch, time.time() - start, accuracy_train)

    # Evaluate on the held-out validation split after every epoch.
    loss_val, accuracy_val = compute_loss(model, valid_dataloader, criterion, MODE_EVALUATE)

    print("VALIDATION INFO", end=" ")
    print_info(loss_val, epoch, n_epoch, accuracy=accuracy_val)

# Save the trained model under a timestamped name.
# NOTE(review): this pickles the whole module object; saving
# model.state_dict() is the more portable option - confirm what any loading
# code expects before changing the format.
model_name = f"BINARY_CLASSIFIER_BERT_{strftime('%Y-%m-%d_%H-%M-%S', localtime())}"
torch.save(model, model_name)

  2%|▏         | 3/157 [00:44<37:59, 14.80s/it]


KeyboardInterrupt: 