# Obtaining GPT-2 based Feature Representation

### 1. Imports and Global Settings

In [1]:
from datasets import load_dataset, disable_caching
from tqdm.notebook import tqdm
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification, \
    DataCollatorWithPadding, set_seed
import torch
from torch.nn.functional import one_hot
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
set_seed(42)
disable_caching()

### 2. Pre-Processing
- Get SNLI Dataset (Train fold) and shuffle it
- Remove instances without gold standard labels, i.e., label = -1
- One-hot encoding for labels
- Partition data 10%/90%; store the 10% as `warmup`
- Tokenise `warmup`

In [2]:
snli_train = load_dataset('snli', split = 'train').shuffle(seed = 42)
snli_train = snli_train.filter(lambda x: x['label'] != -1).map( \
    lambda x: {'label': one_hot(torch.tensor(x['label']), 3).type(torch.float32).numpy()}, \
    batched = True)
warmup = snli_train.select(range(0, int(len(snli_train)/10)))

Reusing dataset snli (/home/shana92/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/551 [00:00<?, ?ba/s]

  0%|          | 0/550 [00:00<?, ?ba/s]

In [3]:
# set up tokeniser
# padding to left because GPT2 uses last token for prediction
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", padding_side = 'left', \
                                              padding = True, truncation = True)
tokenizer.pad_token = tokenizer.eos_token # pad with 'eos' token

In [4]:
# tokenize data
warmup = warmup.map(lambda x: tokenizer(x['premise'] + '|' + x['hypothesis']))

  0%|          | 0/54936 [00:00<?, ?ex/s]

In [5]:
# keep only needed columns, set data format to PyTorch
warmup.set_format(type = 'torch', columns = ['label', 'input_ids', 'attention_mask'])

### 3. Obtain Feature Representation:

In [6]:
# set up data collator - https://huggingface.co/docs/transformers/main_classes/data_collator
# this is a (callable) helper object that sends batches of data to the model
data_collator = DataCollatorWithPadding(tokenizer, padding = 'max_length', \
                                         return_tensors = 'pt', max_length = 512)

In [7]:
# set up GPT2
model = GPT2ForSequenceClassification.from_pretrained("gpt2", 
                                  num_labels=3,
                                  problem_type="multi_label_classification")
model.config.pad_token_id = model.config.eos_token_id # specify pad_token used by tokenizer

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# set up a dataloader (batch generator)
dataloader = torch.utils.data.DataLoader(warmup, batch_size=16, \
                                 shuffle=True, collate_fn=data_collator) # batch size constrained by GPU memory

In [9]:
# move model to device
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [10]:
# set up optimizer (loss function in-built)
# `lr` set to match Le et al. (2020) - https://arxiv.org/abs/2002.04108
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [11]:
# Train
model.train()

size = len(dataloader.dataset)

for epoch in range(3):

    for batch, data in tqdm(enumerate(dataloader), total = len(dataloader)):

            # Torch requirement
            model.zero_grad()

            # Compute prediction and loss
            outputs = model(**data.to(device))
            loss = outputs[0]

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            if batch % int(len(dataloader)/10) == 0:
                loss, current = loss.item(), batch * len(data['labels'])
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

  0%|          | 0/3434 [00:00<?, ?it/s]

loss: 1.241931  [    0/54936]
loss: 0.697172  [ 5488/54936]
loss: 0.623014  [10976/54936]
loss: 0.630340  [16464/54936]
loss: 0.613239  [21952/54936]
loss: 0.606668  [27440/54936]
loss: 0.640213  [32928/54936]
loss: 0.680190  [38416/54936]
loss: 0.599496  [43904/54936]
loss: 0.522306  [49392/54936]
loss: 0.563130  [54880/54936]


  0%|          | 0/3434 [00:00<?, ?it/s]

loss: 0.516606  [    0/54936]
loss: 0.563569  [ 5488/54936]
loss: 0.460693  [10976/54936]
loss: 0.397880  [16464/54936]
loss: 0.636643  [21952/54936]
loss: 0.513150  [27440/54936]
loss: 0.662384  [32928/54936]
loss: 0.456836  [38416/54936]
loss: 0.380365  [43904/54936]
loss: 0.582676  [49392/54936]
loss: 0.435975  [54880/54936]


  0%|          | 0/3434 [00:00<?, ?it/s]

loss: 0.453138  [    0/54936]
loss: 0.625132  [ 5488/54936]
loss: 0.474285  [10976/54936]
loss: 0.448634  [16464/54936]
loss: 0.325767  [21952/54936]
loss: 0.472658  [27440/54936]
loss: 0.497315  [32928/54936]
loss: 0.473477  [38416/54936]
loss: 0.376416  [43904/54936]
loss: 0.632152  [49392/54936]
loss: 0.390894  [54880/54936]


In [12]:
# Save model
torch.save(model, 'feature_rep.pth')

In [13]:
# check last batch loss
loss.item()

0.3114522397518158

The feature representation $\Phi$ to be extracted (see report) is the saved model with the final linear layer removed