https://towardsdatascience.com/fine-tuning-bert-and-roberta-for-high-accuracy-text-classification-in-pytorch-c9e63cf64646

In [16]:
import pandas as pd
import torch
import torch.nn as nn 
from torchtext.data import Field,TabularDataset,BucketIterator,Iterator
from transformers import AutoConfig,AutoTokenizer,AutoModelForSequenceClassification,AdamW,get_linear_schedule_with_warmup,AutoModel

import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

from tqdm.notebook import tqdm_notebook as tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import gc
gc.collect()
torch.cuda.empty_cache()

In [2]:
#  Check for CUDA
# Set random seed and set device to GPU.
torch.manual_seed(17)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Working on device: {device}, model: {torch.cuda.get_device_name(0)}')

Working on device: cuda, model: GeForce RTX 2060


In [3]:
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path='roberta-base')

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = 'roberta-base')

# model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path = 'bert-base-uncased',config = model_config)

In [4]:
# Set tokenizer hyperparameters.
MAX_SEQ_LEN = 256
BATCH_SIZE = 16
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

In [5]:
label = Field(sequential=False, use_vocab=False, batch_first=True)
text = Field(use_vocab=False, 
            tokenize=tokenizer.encode, 
            include_lengths=False, 
            batch_first=True,
            fix_length=MAX_SEQ_LEN, 
            pad_token=PAD_INDEX, 
            unk_token=UNK_INDEX)

In [6]:
fields = {'titletext' : ('titletext', text), 'label' : ('label', label)}

In [7]:
train_data,valid_data,test_data = TabularDataset.splits(path = '../Data/news/',train='news_train.csv',validation='news_validation.csv',test='news_test.csv',format = 'CSV',fields = fields,skip_header=False)

In [8]:
# Iterators

train_iter = BucketIterator(train_data,batch_size = BATCH_SIZE,sort_key = lambda x: len(x.titletext),
                            device = device,train=True,sort=True,sort_within_batch=False,shuffle=True)
valid_iter = BucketIterator(valid_data,batch_size = BATCH_SIZE,sort_key = lambda x: len(x.titletext),
                            device = device,train=True,sort=True,sort_within_batch=False,shuffle=True)
test_iter = Iterator(test_data,train=False,batch_size = BATCH_SIZE,device = device,sort=False, shuffle=False)

In [17]:
class NewsRoBerta(nn.Module):

    def __init__(self,n_classes):

        super(NewsRoBerta,self).__init__()
        self.roberta = AutoModel.from_pretrained("roberta-base")
        self.roberta_drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.roberta.config.hidden_size,n_classes)

    def forward(self,input_ids,attention_mask):
        _,pooled_output  = self.roberta(input_ids = input_ids,attention_mask = attention_mask)
        output = self.roberta_drop(pooled_output)
        return self.out(output)

In [18]:
def pretrain(model,
             optimizer, 
             train_iter, 
             valid_iter, 
             scheduler = None,
             valid_period = len(train_iter),
             num_epochs = 5):


    for param in model.roberta.parameters():
       param.requires_grad=False
    
    model.train()

    train_loss = 0.0
    valid_loss = 0.0   
    global_step = 0  


    for epoch in tqdm(range(num_epochs)):
        for (source, target), _ in train_iter:
            mask = (source!=PAD_INDEX).type(torch.uint8)

            y_pred = model(input_ids =source,attention_mask = mask)

            loss = nn.CrossEntropyLoss()(y_pred, target)

            loss.backward()

            optimizer.step()    
            scheduler.step()
                
            optimizer.zero_grad()
            
            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            if global_step % valid_period == 0:
                model.eval()

                with torch.no_grad():
                    for (source, target), _ in train_iter:
                        mask = (source!=PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids =source,attention_mask = mask)

                        loss = nn.CrossEntropyLoss()(y_pred, target)

                        valid_loss+=loss.item()
                
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)

                model.train()

                # print summary
                print('Epoch [{}/{}], global step [{}/{}], Train Loss Loss: {:.4f}, Val Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))

                train_loss = 0.0                
                valid_loss = 0.0

    
    # Set bert parameters back to trainable
    for param in model.roberta.parameters():
        param.requires_grad = True
        
    print('Pre-training done!')


In [19]:
def save_checkpoint(path, model, valid_loss):
    torch.save({'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}, path)

    
def load_checkpoint(path, model):    
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict['model_state_dict'])
    
    return state_dict['valid_loss']


def save_metrics(path, train_loss_list, valid_loss_list, global_steps_list):   
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, path)


def load_metrics(path):    
    state_dict = torch.load(path, map_location=device)
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [20]:
def train(model,
          optimizer,
          train_iter,
          valid_iter,
          scheduler = None,
          num_epochs = 5,
          valid_period = len(train_iter),
          output_path = '../model/NewsRoBerta/'):
    
    # Initialize losses and loss histories
    train_loss = 0.0
    valid_loss = 0.0
    train_loss_list = []
    valid_loss_list = []
    best_valid_loss = float('Inf')
    
    global_step = 0
    global_steps_list = []
    
    model.train()
    
    # Train loop
    for epoch in range(num_epochs):
        for (source, target), _ in train_iter:
            mask = (source != PAD_INDEX).type(torch.uint8)

            y_pred = model(input_ids=source,  
                           attention_mask=mask)
            #output = model(input_ids=source,
            #              labels=target,
            #              attention_mask=mask)
            
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
            #loss = output[0]
            
            loss.backward()
            
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
                
            optimizer.zero_grad()
            
            # Update train loss and global step
            train_loss += loss.item()
            global_step += 1

            # Validation loop. Save progress and evaluate model performance.
            if global_step % valid_period == 0:
                model.eval()
                
                with torch.no_grad():                    
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids=source, 
                                       attention_mask=mask)
                        #output = model(input_ids=source,
                        #               labels=target,
                        #               attention_mask=mask)
                        
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        #loss = output[0]
                        
                        valid_loss += loss.item()

                # Store train and validation loss history
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                train_loss_list.append(train_loss)
                valid_loss_list.append(valid_loss)
                global_steps_list.append(global_step)

                # print summary
                print('Epoch [{}/{}], global step [{}/{}] | Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))
                
                # checkpoint
                if best_valid_loss > valid_loss:
                    best_valid_loss = valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
                        
                train_loss = 0.0                
                valid_loss = 0.0
                model.train()
    
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Training done!')

In [21]:
NUM_EPOCHS = 6
steps_per_epoch = len(train_iter)

model = NewsRoBerta(n_classes=2)
model = model.to(device)

In [22]:
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*1, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)


In [23]:
pretrain(model=model,
         train_iter=train_iter,
         valid_iter=valid_iter,
         optimizer=optimizer,
         scheduler=scheduler,
         num_epochs=NUM_EPOCHS)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

Epoch [1/6], global step [221/1326], Train Loss Loss: 0.6809, Val Loss: 1.7065
Epoch [2/6], global step [442/1326], Train Loss Loss: 0.6857, Val Loss: 1.6808
Epoch [3/6], global step [663/1326], Train Loss Loss: 0.6904, Val Loss: 1.6324
Epoch [4/6], global step [884/1326], Train Loss Loss: 0.6920, Val Loss: 1.6063
Epoch [5/6], global step [1105/1326], Train Loss Loss: 0.6919, Val Loss: 1.5889
Epoch [6/6], global step [1326/1326], Train Loss Loss: 0.6888, Val Loss: 1.5871

Pre-training done!


In [25]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [26]:
optimizer = AdamW(model.parameters(), lr=2e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*2, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)

train(model=model, 
      train_iter=train_iter, 
      valid_iter=valid_iter, 
      optimizer=optimizer, 
      scheduler=scheduler, 
      num_epochs=NUM_EPOCHS)

RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 6.00 GiB total capacity; 4.44 GiB already allocated; 20.63 MiB free; 4.48 GiB reserved in total by PyTorch)