The purpose of this notebook is to illustrate how a pre-trained large language model can be fine-tuned for a specific task. In short, we use the pre-trained DistilBERT model with two additional linear layers on top that have not been pre-trained (the parameters of those two layers are initialized to random values).

The original DistilBERT model was trained on ~16GB of data. The fine-tuning done here uses a relatively small dataset of ~2500 observations, where each observation is roughly a paragraph of text.

A good reference on fine-tuning a large language model:
* https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [None]:
import gc
import json
import torch
import itertools
import time
import datetime
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy.ma as ma

import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler, random_split

#from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AdamW 
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Location of the training CSV read by the next cell.
train_file = '../input/commonlitreadabilityprize/train.csv'

# MAX_LENGTH: every tokenized excerpt is padded/truncated to this many tokens.
# BATCH_SIZE: batch size for both the training and the validation DataLoader.
MAX_LENGTH = 256
BATCH_SIZE = 32

In [None]:
# Read the training CSV into a DataFrame and report its (rows, columns) shape.
train_data = pd.read_csv(train_file)
print(f'train data shape: {train_data.shape}')

In [None]:
# Summary statistics of the training data (rendered as the cell output).
train_data.describe()

Download the DistilBERT tokenizer and model using the huggingface transformers module. Note that the num_labels parameter set to 1 indicates that we have a regression output (rather than classification). 

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer matching the pre-trained checkpoint (uncased => lower-case input).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# num_labels=1 requests a single-output (regression) head; attention weights
# and intermediate hidden states are not returned.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False,
)

The next two cells are simply to check out the data a little more. 

In [None]:
# Excerpts (model input) and targets as numpy arrays.
X, y = train_data.excerpt.values, train_data.target.values
print(X.shape, y.shape, sep='\n')

In [None]:
# Tokenize the first excerpt once and reuse the tokens for every printout.
first_tokens = tokenizer.tokenize(X[0])

print('original: \n', X[0])

print('\n\ntokenized: \n', first_tokens)
print('len(tokenized(X[0])): \n', len(first_tokens))

print('\n\ntoken IDs: \n', tokenizer.convert_tokens_to_ids(first_tokens))

We set the max length already above, but this would be a check to see what the max length is over all observations (earlier we created a histogram of the lengths and determined 256 is a reasonable max length). 

In [None]:
# Longest tokenized excerpt, counting the `[CLS]` and `[SEP]` special tokens
# (0 when there are no excerpts at all).
observed_max_len = max(
    (len(tokenizer.encode(exc, add_special_tokens=True)) for exc in X),
    default=0,
)

# max len in training data is 314, but 256 will fully cover most observations
print('Max sentence length: ', observed_max_len)

Using the DistilBERT tokenizer to tokenize the raw data. 

In [None]:
# Encode every excerpt. For each one `encode_plus`:
#   * tokenizes the text and wraps it in `[CLS]` ... `[SEP]`,
#   * maps the tokens to vocabulary IDs,
#   * pads or truncates to MAX_LENGTH tokens,
#   * builds the attention mask that separates real tokens from `[PAD]`,
#   * returns PyTorch tensors with a leading dimension of 1.
encodings = [
    tokenizer.encode_plus(
        exc,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_attention_mask=True,
        return_tensors='pt',
    )
    for exc in X
]

# Stack the per-excerpt rows into one tensor each; targets become float32.
input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
labels = torch.tensor(y).float()

In [None]:
# Show the first excerpt next to its encoded form: one row of token IDs,
# padded/truncated to MAX_LENGTH by the tokenizer call above.
print('original X[0]: ', X[0])
print('\n\ntoken IDs for X[0]:', input_ids[0])

Split the data into training and validation sets. 

In [None]:
# Bundle token IDs, attention masks and targets so that indexing the dataset
# returns all three tensors for one excerpt.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 85% / 15% train-validation split.
train_size = int(0.85 * len(dataset))
val_size = len(dataset) - train_size

# Split with a dedicated, seeded generator. The global seeds are only set
# further down (in the training cell), so without this the membership of the
# two subsets would differ from run to run.
SPLIT_SEED = 1
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
# Both loaders use BATCH_SIZE; smaller batch sizes are generally recommended
# for fine-tuning BERT.

# Training batches are drawn in random order (shuffle=True makes the
# DataLoader use a RandomSampler over the dataset).
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Order is irrelevant for validation, so the samples are read sequentially
# (shuffle=False makes the DataLoader use a SequentialSampler).
validation_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In the following cell we list all of the named parameter tensors (the weights and biases of each layer) in the model.

In [None]:
# Move the model to the device selected when it was loaded. Unlike
# `model.cuda()`, this also works on machines without a GPU.
model.to(device)

# (name, tensor) pairs for every parameter tensor of the model. Note that the
# count printed below is the number of parameter tensors (weights and biases
# are counted separately), not the number of transformer layers.
params = list(model.named_parameters())

print('The DistilBERT model number of layers: {}.\n'.format(len(params)))

for i, (param_name, param_tensor) in enumerate(params):
    print("layer {:>3}: {:<55} {:>12}".format(i, param_name, str(tuple(param_tensor.size()))))

In [None]:
def create_optimizer(model, backbone_lr=1e-5, head_lr=1e-3, backbone_prefix="distilbert."):
    """Build an AdamW optimizer with separate learning rates for backbone and head.

    The pre-trained backbone gets a small learning rate so fine-tuning only
    nudges its weights, while the freshly initialised layers on top (every
    parameter whose name does not start with ``backbone_prefix``) get a much
    larger one because they start from random values.

    Parameters are assigned to a group by name rather than by position in
    ``model.named_parameters()``, so the split does not depend on the exact
    number of parameter tensors in the backbone.

    Args:
        model: Module exposing ``named_parameters()``.
        backbone_lr: Learning rate for parameters under ``backbone_prefix``.
        head_lr: Learning rate for all remaining parameters.
        backbone_prefix: Name prefix identifying the pre-trained backbone.

    Returns:
        A ``torch.optim.AdamW`` instance with one parameter group per
        non-empty set (backbone first, then head).

    Raises:
        ValueError: Raised by the optimizer if the model has no parameters.
    """
    backbone_params, head_params = [], []
    for name, param in model.named_parameters():
        if name.startswith(backbone_prefix):
            backbone_params.append(param)
        else:
            head_params.append(param)

    param_groups = [
        {"params": group, "lr": lr}
        for group, lr in ((backbone_params, backbone_lr), (head_params, head_lr))
        if group  # skip an empty group instead of registering it
    ]

    # `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
    # eps and weight_decay are pinned to the values the transformers class
    # used by default, so the update rule stays as close as possible to it.
    return torch.optim.AdamW(param_groups, eps=1e-6, weight_decay=0.0)

In [None]:
# Mean-squared-error loss between the model output and the target.
criterion = nn.MSELoss()

# Optimizer built by create_optimizer, which assigns a different learning rate
# to the pre-trained layers and to the final linear layers.
optimizer = create_optimizer(model)

In [None]:
# Fine-tuning does not need many epochs; recommendations for BERT models are
# between 2 and 4.
EPOCHS = 4

# The scheduler is stepped once per batch, so the schedule spans
# [number of batches] x [number of epochs] steps (not the number of samples).
total_steps = EPOCHS * len(train_dataloader)

# Learning rate decays linearly over `total_steps`, without a warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second first; the string is
    the standard ``datetime.timedelta`` text form.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Seed every RNG used from here on (Python, NumPy, PyTorch CPU and CUDA) so
# that batch shuffling and dropout are repeatable across runs.
seed_val = 1

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# NOTE: the former `torch.set_default_dtype(torch.float64)` call was removed.
# The model weights and the labels are float32, so switching the global
# default here only affected tensors created afterwards and risked
# float32/float64 mismatches inside the forward pass.

# Per-epoch record of the training/validation loss and the timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

for epoch_i in range(EPOCHS):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    t0 = time.time()

    # Sum of the per-batch losses of this epoch.
    total_train_loss = 0.0

    # `train()` only switches the mode (e.g. enables dropout); it does not
    # perform any training by itself.
    model.train()

    # Targets and predictions of this epoch, kept for plotting later on.
    y_train = {'actual': [], 'predicted': []}
    for step, batch in enumerate(train_dataloader):

        # Progress update every 25 batches.
        if step % 25 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds three tensors, copied to the training device here:
        #   [0]: input ids
        #   [1]: attention masks (1 = real token, 0 = padding)
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate by default, so clear them before each step.
        model.zero_grad()

        # Forward pass. The attention mask must be passed so that the `[PAD]`
        # tokens added by `padding='max_length'` are ignored by
        # self-attention. Without labels, the first output is the logits.
        logits = model(b_input_ids, attention_mask=b_attention_mask)[0]

        # MSE between the single regression output and the target.
        loss = criterion(logits.flatten(), b_labels.float())
        total_train_loss += loss.item()

        # Backpropagation.
        loss.backward()

        # For plotting results later on.
        y_train['actual'] += b_labels.float().cpu().numpy().flatten().tolist()
        y_train['predicted'] += logits.detach().cpu().numpy().flatten().tolist()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average of the per-batch losses.
    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure the loss on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode disables dropout.
    model.eval()

    # Sum of squared errors over the validation set, and sum of batch losses.
    batch_squared_errors = 0
    total_eval_loss = 0.0

    y_val = {'actual': [], 'predicted': []}
    for step, batch in enumerate(validation_dataloader):

        # Progress update every 5 batches.
        if step % 5 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(validation_dataloader), elapsed))

        # Same batch layout as in training: ids, attention masks, labels.
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation, so skip building the graph.
        with torch.no_grad():
            logits = model(b_input_ids, attention_mask=b_attention_mask)[0]
            loss = criterion(logits.flatten(), b_labels.float())

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move labels/targets and predictions to CPU.
        preds = logits.detach().cpu().numpy()
        targets = b_labels.to('cpu').numpy()

        # For plotting results later on.
        y_val['actual'] += targets.flatten().tolist()
        y_val['predicted'] += preds.flatten().tolist()

        batch_squared_errors += np.square(targets - preds.flatten()).sum()

    # Average of the per-batch losses.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# MSE over the predictions collected during the last epoch. The training
# predictions were gathered batch by batch while the model was in training
# mode and its weights were still being updated.
train_mse = mean_squared_error(y_true=y_train['actual'], y_pred=y_train['predicted'])
valid_mse = mean_squared_error(y_true=y_val['actual'], y_pred=y_val['predicted'])

print(f"DistilBERT model training MSE = {train_mse:.6f}")
print(f"DistilBERT model validation MSE = {valid_mse:.6f}")

In [None]:
# Inspect the shape of the input-id tensor of `batch`, the loop variable left
# over from the training cell (i.e. the last validation batch processed).
t = batch[0]
t.shape

In [None]:

# Per-epoch average losses recorded by the training loop.
training_losses = [epoch_stats['Training Loss'] for epoch_stats in training_stats]
validation_losses = [epoch_stats['Valid. Loss'] for epoch_stats in training_stats]

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(training_losses) + 1)

# Label both curves and show a legend so the figure can be read on its own.
plt.plot(epochs, training_losses, c='r', label='Training')
plt.plot(epochs, validation_losses, c='b', label='Validation')
plt.xticks(epochs)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Average loss per epoch")
plt.legend()
plt.show()

In [None]:
def _plot_actual_vs_predicted(ax, actual, predicted, title, color, alpha):
    """Scatter predictions against targets on `ax` with a y = x reference line.

    The reference line is drawn in data coordinates, so it marks perfect
    predictions even when the two axes end up with different limits.
    """
    ax.scatter(actual, predicted, c=color, alpha=alpha)
    if len(actual) > 0 and len(predicted) > 0:
        lo = min(min(actual), min(predicted))
        hi = max(max(actual), max(predicted))
        ax.plot([lo, hi], [lo, hi], c='k', alpha=0.2)
    ax.set_title(title)
    ax.set_xlabel("actual")
    ax.set_ylabel("predicted")


matplotlib.rc('figure', figsize=(15,4))
fig, (ax1, ax2) = plt.subplots(1, 2)
_plot_actual_vs_predicted(ax1, y_train['actual'], y_train['predicted'], "Training data", 'b', 0.2)
_plot_actual_vs_predicted(ax2, y_val['actual'], y_val['predicted'], "Validation data", 'g', 0.4)
plt.show()