<a href="https://colab.research.google.com/github/shaangao/LLMs/blob/main/llms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM Feature Extraction and Fine-Tuning for Sentence Classification

In [1]:
%%capture
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import collections
import json
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

## dataset and data loaders

In [4]:
SPLITS = ['train', 'dev', 'test']


class DBPediaDataset(Dataset):

    '''
    DBPedia dataset backed by a JSON-lines file.
      Args:
        path[str]: path to the original data; every non-blank line must be a
          JSON object holding at least a 'label' and a 'sentence' entry.
    '''

    def __init__(self, path):
        # Explicit encoding keeps decoding independent of the platform locale;
        # blank lines are skipped so they do not crash json.loads.
        with open(path, encoding='utf-8') as fin:
            self._data = [json.loads(line) for line in fin if line.strip()]
        # number of distinct labels observed in this split
        self._n_classes = len({datum['label'] for datum in self._data})

    def __getitem__(self, index):
        return self._data[index]

    def __len__(self):
        return len(self._data)

    @property
    def n_classes(self):
        '''Number of distinct labels found in the loaded file.'''
        return self._n_classes

    @staticmethod
    def collate_fn(tokenizer, device, batch):

        '''
        The collate function that compresses a training batch.
          Args:
            tokenizer: callable that converts a list of sentences to tensors.
            device[torch.device]: the device the returned tensors are moved to.
            batch[list[dict[str, Any]]]: data in the batch.
          Returns:
            labels[torch.LongTensor]: the labels in the batch.
            sentences[dict[str, torch.Tensor]]: sentences converted by tokenizers.
        '''

        labels = torch.tensor([datum['label'] for datum in batch]).long().to(device)
        sentences = tokenizer(
            [datum['sentence'] for datum in batch],
            return_tensors='pt',  # pt = pytorch style tensor
            padding=True,         # pad to the longest sentence in the batch
            truncation=True       # clip over-long sentences instead of overflowing the model
        )
        # move every tokenizer output (ids, masks, ...) to the target device
        for key in list(sentences.keys()):
            sentences[key] = sentences[key].to(device)
        return labels, sentences


def construct_datasets(prefix, batch_size, tokenizer, device):

    '''
    Builds one dataset and one data loader per split listed in SPLITS.
      Args:
        prefix[str]: prefix of the dataset files (e.g., dbpedia_); the file for
          a split is read from f'{prefix}{split}.json'.
        batch_size[int]: maximum number of examples in a batch.
        tokenizer: model tokenizer that converts sentences to integer tensors.
        device[torch.device]: the device (cpu/gpu) that the tensors should be on.
      Returns:
        datasets[dict[str, Dataset]]: split name -> constructed dataset.
        dataloaders[dict[str, DataLoader]]: split name -> constructed data loader.
    '''

    def _collate(examples):
        # bind tokenizer/device so the DataLoader only has to pass the batch
        return DBPediaDataset.collate_fn(tokenizer, device, examples)

    datasets = collections.defaultdict()
    dataloaders = collections.defaultdict()

    for split_name in SPLITS:
        split_data = DBPediaDataset(f'{prefix}{split_name}.json')
        datasets[split_name] = split_data

        # only the training split is reshuffled between epochs
        reshuffle = split_name == 'train'
        dataloaders[split_name] = DataLoader(
            split_data,
            batch_size=batch_size,
            shuffle=reshuffle,
            collate_fn=_collate
        )

    return datasets, dataloaders

## classifier architecture

In [5]:
class Classifier(nn.Module):

    '''
    Feed-forward classifier: a stack of linear layers, each followed by its
    own activation.
      Args:
        in_size[int]: dimension of the input features.
        layer_sizes[list[int]]: output size of every linear layer, in order
          (the last entry is the output layer).
        layer_acts[list]: one callable per layer, applied to that layer's output.
    '''

    def __init__(self, in_size, layer_sizes:list, layer_acts:list):

        super().__init__()

        # chain the layers: each one consumes the previous layer's output size
        self.layers = nn.ModuleList()
        fan_in = in_size
        for fan_out in layer_sizes:
            linear = nn.Linear(fan_in, fan_out)
            # small uniform weights, zero biases
            linear.weight.data.uniform_(-0.01, 0.01)
            linear.bias.data.zero_()
            self.layers.append(linear)
            fan_in = fan_out

        # kept as a plain list, indexed in step with self.layers
        self.layer_acts = layer_acts


    def forward(self, x):

        for idx in range(len(self.layers)):
            linear = self.layers[idx]
            activation = self.layer_acts[idx]
            x = activation(linear(x))

        return x

## 1. BERT [CLS] feature extraction for classification

### setup

In [6]:
# hyperparameters for the frozen-feature experiments
batch_size = 32
classifier_hidden_size = 32

# pretrained cased BERT: tokenizer plus encoder (encoder goes to the GPU when one is present)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model = AutoModel.from_pretrained('bert-base-cased')
bert_model = bert_model.cuda() if torch.cuda.is_available() else bert_model

# datasets / loaders tokenized with the BERT tokenizer, batches placed on the encoder's device
datasets, dataloaders = construct_datasets(
    '/content/drive/MyDrive/LLMs/data/dbpedia_',
    batch_size,
    tokenizer,
    bert_model.device
)

# optional sanity checks (uncomment to inspect a single example / batch):
# datasets['train'][0]
# first_labels, first_sentences = next(iter(dataloaders['train']))
# first_labels
# first_sentences.keys()

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

### train and eval util funcs

In [7]:
# func for extracting frozen BERT token representations for sentences and pooling

def extract_bert_rep(bert_model, sentences:dict, pooling:str):

    '''
    Extracts frozen token representations and pools them into one vector per sentence.
      Args:
        bert_model: callable invoked as bert_model(**sentences); its result is
          indexed with 'last_hidden_state' to get per-token features (B, L, D).
        sentences[dict[str, torch.Tensor]]: tokenizer output for the batch. When
          it has an 'attention_mask' entry, positions where the mask is 0 are
          excluded from 'mean' and 'max' pooling.
        pooling[str]: 'first' (first token, i.e. [CLS] for BERT), 'mean' or 'max'.
      Returns:
        pooled_features[torch.Tensor]: pooled features of shape (B, D).
      Raises:
        ValueError: if pooling is not one of 'first', 'mean', 'max'.
    '''

    # fail fast, before paying for the forward pass
    if pooling not in ('first', 'mean', 'max'):
        raise ValueError(f"unknown pooling '{pooling}'; expected 'first', 'mean' or 'max'")

    # extract frozen token representations for sentences
    with torch.no_grad():   # keep encoder params fixed
        unpooled_features = bert_model(**sentences)['last_hidden_state']  # (B, L, D)

    if pooling == 'first':
        return unpooled_features[:, 0, :]   # (B, D)

    # (B, L, 1) boolean mask of tokens that take part in the pooling
    if 'attention_mask' in sentences:
        keep = sentences['attention_mask'].to(unpooled_features.device).bool().unsqueeze(-1)
    else:
        keep = torch.ones(
            unpooled_features.shape[:2], dtype=torch.bool, device=unpooled_features.device
        ).unsqueeze(-1)

    if pooling == 'mean':
        token_sum = unpooled_features.masked_fill(~keep, 0.0).sum(dim=1)   # (B, D)
        n_tokens = keep.sum(dim=1).clamp(min=1)   # (B, 1); clamp guards against division by zero
        return token_sum / n_tokens

    # pooling == 'max': masked positions are set to -inf so they never win the max
    return unpooled_features.masked_fill(~keep, float('-inf')).max(dim=1).values

In [8]:
# func for running 1 epoch of training

def train1epoch(classifier, train_dataloader, optimizer, criterion):

    '''
    Trains the classifier for a single pass over train_dataloader.
    Features come from the module-level bert_model via extract_bert_rep
    (first-token pooling), so only the classifier is updated here.
      Args:
        classifier[nn.Module]: model being trained.
        train_dataloader: iterable yielding (labels, sentences) batches.
        optimizer: optimizer stepping the trainable parameters.
        criterion: loss called as criterion(output, labels).
      Returns:
        epoch_loss[float]: sum of the per-batch loss values over the epoch.
    '''

    batches = tqdm.tqdm(train_dataloader)   # progress bar over mini-batches
    classifier.train()

    running_loss = 0
    for targets, batch_sentences in batches:

        # frozen first-token ([CLS]) features, shape (B, D)
        features = extract_bert_rep(bert_model=bert_model, sentences=batch_sentences, pooling='first')

        # one optimization step
        optimizer.zero_grad()
        batch_loss = criterion(classifier(features), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss

In [9]:
# eval

def eval(classifier, eval_dataloader):

    '''
    Computes classification accuracy of the classifier over eval_dataloader.
    Features come from the module-level bert_model via extract_bert_rep
    (first-token pooling). Note: this name shadows Python's builtin eval.
      Args:
        classifier[nn.Module]: model to evaluate.
        eval_dataloader: iterable yielding (labels, sentences) batches.
      Returns:
        yaccu[float]: number of correct predictions divided by number of examples.
    '''

    batches = tqdm.tqdm(eval_dataloader)   # progress bar over mini-batches
    classifier.eval()

    n_correct, n_seen = 0, 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for targets, batch_sentences in batches:

            # frozen first-token ([CLS]) features, shape (B, D)
            features = extract_bert_rep(bert_model=bert_model, sentences=batch_sentences, pooling='first')

            # predicted class = most probable class under the softmax
            class_probs = F.softmax(classifier(features), dim=1)
            predicted = class_probs.argmax(dim=1)

            n_correct += (predicted == targets).sum().item()
            n_seen += len(targets)

    return n_correct / n_seen

In [14]:
# wrapper for train & eval

def main_process(classifier, name, optimizer, criterion, dataloaders, max_epochs=10, early_stopping=3,
                 checkpoint_dir='/content/drive/MyDrive/LLMs/classifiers/'):

    '''
    Trains the classifier with per-epoch dev evaluation, keeps the best
    checkpoint on disk, restores it and evaluates it on the test split.
      Args:
        classifier[nn.Module]: model to train (modified in place).
        name[str]: run name; the checkpoint file is f'{name}_best.pth.tar'.
        optimizer: optimizer passed through to train1epoch.
        criterion: loss passed through to train1epoch.
        dataloaders[dict]: must provide the 'train', 'dev' and 'test' loaders.
        max_epochs[int]: maximum number of training epochs.
        early_stopping[int or None]: stop once this many epochs have passed
          since the best dev accuracy; None disables early stopping.
        checkpoint_dir[str]: directory for the best-classifier checkpoint;
          created if it does not exist.
      Returns:
        epoch_losses[list]: value returned by train1epoch for every epoch run.
        train_evals[list]: always empty (train-set evaluation is disabled).
        dev_evals[list[float]]: dev accuracy after every epoch run.
        best_dev_eval[float]: best dev accuracy seen (0 if no epoch was run).
        devtest_eval[float]: test accuracy of the restored best classifier.
    '''

    # where the best classifier of this run is checkpointed
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, name + '_best.pth.tar')

    # initialize vars: track metrics
    epoch_losses = []
    train_evals = []
    dev_evals = []

    # initialize vars: track best classifier
    best_dev_eval = 0
    best_classifier_epoch = -1
    last_epoch = -1   # stays -1 when max_epochs == 0, so nothing is loaded below


    # train and eval
    for epoch in range(max_epochs):

        last_epoch = epoch
        print(f'EPOCH {epoch+1}')

        # train
        print(f'----- TRAIN -----')
        epoch_loss = train1epoch(
            classifier=classifier,
            train_dataloader=dataloaders['train'],
            optimizer=optimizer,
            criterion=criterion
        )
        print(f'  epoch loss: {epoch_loss}')
        epoch_losses.append(epoch_loss)

        print(f'----- EVAL -----')

        # eval on dev set
        dev_eval = eval(classifier=classifier, eval_dataloader=dataloaders['dev'])
        print(f'  dev accuracy: {dev_eval}')
        dev_evals.append(dev_eval)

        # update best classifier based on dev eval; the first epoch always
        # counts as best so a checkpoint from THIS run exists before any load
        # (otherwise a dev accuracy of 0 would lead to loading a missing or
        # stale file from an earlier run with the same name)
        if best_classifier_epoch < 0 or dev_eval > best_dev_eval:

            # save state_dict of best classifier so far
            torch.save(classifier.state_dict(), checkpoint_path)

            # update which epoch best_classifier is from
            best_classifier_epoch = epoch

            # update best dev accuracy
            best_dev_eval = dev_eval

        print(f'  best classifier from epoch {best_classifier_epoch+1}')

        # early stopping based on dev eval
        if early_stopping is not None:
            if epoch - best_classifier_epoch >= early_stopping:
                print('=== EARLY STOPPING ===')
                break
    # end training epochs


    # load state_dict of best classifier (modifies input classifier in place);
    # skipped when the last epoch run already is the best one
    if last_epoch != best_classifier_epoch:
        print(f'load best classifier...')
        classifier.load_state_dict(torch.load(checkpoint_path))

    # eval best classifier on devtest set
    print(f'----- EVAL -----')
    print(f'eval best classifier on devtest...')
    devtest_eval = eval(classifier=classifier, eval_dataloader=dataloaders['test'])
    print(f'devtest accuracy: {devtest_eval}\n')


    return epoch_losses, train_evals, dev_evals, best_dev_eval, devtest_eval


### run experiments

In [16]:
# 5 independent runs of 1 training epoch each, every run with its own fixed seed

seeds = [42, 645, 234, 534, 56]

# dev / devtest accuracy of every run, in the order of `seeds`
final_dev_evals = []
devtest_evals = []


for i, seed in enumerate(seeds):

    print(f'======== RUN {i+1} ======== ')

    # reproducibility settings: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # loss for this run
    loss_func = nn.CrossEntropyLoss()

    # fresh classifier for this run: one ReLU hidden layer, linear output layer
    classifier = Classifier(
        in_size=bert_model.config.hidden_size,
        layer_sizes=[classifier_hidden_size, datasets['train'].n_classes],
        layer_acts=[nn.ReLU(), nn.Identity()]
    ).to(bert_model.device)

    # fresh optimizer over the classifier parameters only
    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)

    # train, pick the best checkpoint on dev, evaluate it on test
    _, _, _, final_dev_eval, devtest_eval = main_process(
        classifier=classifier,
        name=f'seed{seed}',
        optimizer=optimizer,
        criterion=loss_func,
        dataloaders=dataloaders,
        max_epochs=1,
        early_stopping=3
    )
    final_dev_evals.append(final_dev_eval)
    devtest_evals.append(devtest_eval)

EPOCH 1
----- TRAIN -----


100%|██████████| 313/313 [00:41<00:00,  7.51it/s]


  epoch loss: 533.342753469944
----- EVAL -----


100%|██████████| 32/32 [00:03<00:00,  8.21it/s]


  dev accuracy: 0.886
  best classifier from epoch 1
----- EVAL -----
eval best classifier on devtest...


100%|██████████| 32/32 [00:03<00:00,  8.88it/s]


devtest accuracy: 0.878

EPOCH 1
----- TRAIN -----


100%|██████████| 313/313 [00:41<00:00,  7.57it/s]


  epoch loss: 528.4443386793137
----- EVAL -----


100%|██████████| 32/32 [00:03<00:00,  8.67it/s]


  dev accuracy: 0.84
  best classifier from epoch 1
----- EVAL -----
eval best classifier on devtest...


100%|██████████| 32/32 [00:03<00:00,  8.77it/s]


devtest accuracy: 0.831

EPOCH 1
----- TRAIN -----


100%|██████████| 313/313 [00:41<00:00,  7.61it/s]


  epoch loss: 541.7327680587769
----- EVAL -----


100%|██████████| 32/32 [00:03<00:00,  8.64it/s]


  dev accuracy: 0.777
  best classifier from epoch 1
----- EVAL -----
eval best classifier on devtest...


100%|██████████| 32/32 [00:03<00:00,  8.76it/s]


devtest accuracy: 0.775

EPOCH 1
----- TRAIN -----


100%|██████████| 313/313 [00:41<00:00,  7.59it/s]


  epoch loss: 525.1410391926765
----- EVAL -----


100%|██████████| 32/32 [00:03<00:00,  8.76it/s]


  dev accuracy: 0.86
  best classifier from epoch 1
----- EVAL -----
eval best classifier on devtest...


100%|██████████| 32/32 [00:03<00:00,  8.70it/s]


devtest accuracy: 0.864

EPOCH 1
----- TRAIN -----


100%|██████████| 313/313 [00:41<00:00,  7.55it/s]


  epoch loss: 585.637094438076
----- EVAL -----


100%|██████████| 32/32 [00:03<00:00,  8.76it/s]


  dev accuracy: 0.805
  best classifier from epoch 1
----- EVAL -----
eval best classifier on devtest...


100%|██████████| 32/32 [00:03<00:00,  8.75it/s]

devtest accuracy: 0.809






In [17]:
print(final_dev_evals)
print(devtest_evals)

[0.886, 0.84, 0.777, 0.86, 0.805]
[0.878, 0.831, 0.775, 0.864, 0.809]


In [18]:
# mean / spread of the dev accuracy over the runs
dev_accu_mean = np.mean(final_dev_evals)
dev_accu_std = np.std(final_dev_evals)

# the reported test accuracy belongs to the run with the highest dev accuracy
print(
    f'Across the 5 runs, the mean accuracy on the dev set is {round(dev_accu_mean, 4)}, '
    f'with a standard deviation of {round(dev_accu_std, 4)}. '
    f'\nThe best-performing classifier (w.r.t. dev set accuracy) has an accuracy of '
    f'{round(devtest_evals[np.argmax(final_dev_evals)], 4)} on the test set.'
)

Across the 5 runs, the mean accuracy on the dev set is 0.8336, with a standard deviation of 0.0387. 
The best-performing classifier (w.r.t. dev set accuracy) has an accuracy of 0.878 on the test set.


## 2. mean-pooling and max-pooling across tokens

## 3. comparing pooling techniques

## 4. fine-tuning BERT with [CLS] features

In [None]:
# set hyperparameters
batch_size = 32
classifier_hidden_size = 32

# instantiate classifier (Classifier takes lists of layer sizes and activations)
classifier = Classifier(
    in_size=bert_model.config.hidden_size,
    layer_sizes=[classifier_hidden_size, datasets['train'].n_classes],
    layer_acts=[nn.ReLU(), nn.Identity()]
).to(bert_model.device)


# 1.3: collect the parameters of the last two encoder layers for fine-tuning;
# the layer indices come from the model config rather than being hard-coded
n_encoder_layers = bert_model.config.num_hidden_layers
last_two_layer_prefixes = tuple(
    f'encoder.layer.{layer_idx}.' for layer_idx in range(n_encoder_layers - 2, n_encoder_layers)
)
params = list()
for name, param in bert_model.named_parameters():
    if name.startswith(last_two_layer_prefixes):
        params.append(param)

# instantiate optimizer over the selected encoder parameters and the classifier
optimizer = torch.optim.Adam(params + list(classifier.parameters()), lr=5e-4)

# set loss
loss_func = nn.CrossEntropyLoss()

# progress bar
pbar = tqdm.tqdm(dataloaders['train'])

# Finish your code here for 1.4. You may re-use most of your code for 1.1.
# NOTE: extract_bert_rep runs the encoder under torch.no_grad(), so the
# fine-tuning loop needs a forward pass with gradients enabled.

## 5. GPT-2