# Obtaining GPT-2 based Feature Representation

### 1. Imports and GPU set up

In [1]:
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification, \
    DataCollatorWithPadding, TrainingArguments, set_seed
import torch
from torch.nn.functional import one_hot
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
set_seed(42)

### 2. Pre-Processing
- Get SNLI Dataset (Train fold) and shuffle it
- One-hot encoding for labels
- Remove instances without gold standard labels, i.e., label = -1
- Partition data 10%/90%; store the 10% as `warmup`
- Tokenise warmup

In [2]:
snli_train = load_dataset('snli', split = 'train').shuffle(seed = 42)
snli_train = snli_train.filter(lambda x: x['label'] != -1).map( \
    lambda x: {'label': one_hot(torch.tensor(x['label']), 3).type(torch.float32).numpy()}, \
    batched = True)
warmup = snli_train.select(range(0, int(len(snli_train)/10)))

Reusing dataset snli (/home/shana92/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Loading cached shuffled indices for dataset at /home/shana92/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-3c48a07b49c48dd6.arrow
Loading cached processed dataset at /home/shana92/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-ed3322adf0d6443a.arrow


  0%|          | 0/550 [00:00<?, ?ba/s]

In [3]:
# padding to left because GPT2 uses last token for prediction
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", padding_side = 'left', \
                                              padding = True, truncation = True)
tokenizer.pad_token = tokenizer.eos_token # pad with 'eos' token

In [4]:
# tokenize data
warmup = warmup.map(lambda x: tokenizer(x['premise'] + '|' + x['hypothesis']))

  0%|          | 0/54936 [00:00<?, ?ex/s]

In [5]:
# keep only needed columns, set data format to PyTorch
warmup.set_format(type = 'torch', columns = ['label', 'input_ids', 'attention_mask'])

### 3. Obtain Feature Representation:

In [6]:
# set up data collator - https://huggingface.co/docs/transformers/main_classes/data_collator
# this is a (callable) helper object that sends batches of data to the model
data_collator = DataCollatorWithPadding(tokenizer, padding = 'max_length', \
                                         return_tensors = 'pt', max_length = 120)

In [7]:
# set up GPT2
model = GPT2ForSequenceClassification.from_pretrained("gpt2", 
                                  num_labels=3,
                                  problem_type="multi_label_classification")
model.resize_token_embeddings(len(tokenizer)) # Resize input token embeddings matrix if num_tokens != config.vocab_size. - Source: HuggingFace
model.config.pad_token_id = model.config.eos_token_id # specify pad_token used by tokenizer

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# set up a dataloader (batch generator)
dataloader = torch.utils.data.DataLoader(warmup, batch_size=92, \
                                         shuffle=True, collate_fn=data_collator)

In [9]:
# move model to device
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [10]:
# set up loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [11]:
# Train
model.train()

size = len(dataloader.dataset)

for epoch in range(3):

    for batch, data in tqdm(enumerate(dataloader), total = len(dataloader)):

            # Torch requirement
            model.zero_grad()

            # Compute prediction and loss
            outputs = model(**data.to(device))
            loss = outputs[0]

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            if batch % int(len(dataloader)/10) == 0:
                loss, current = loss.item(), batch * len(data['labels'])
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

  0%|          | 0/598 [00:00<?, ?it/s]

loss: 1.730098  [    0/54936]
loss: 0.637217  [ 5428/54936]
loss: 0.611745  [10856/54936]
loss: 0.653839  [16284/54936]
loss: 0.631255  [21712/54936]
loss: 0.645865  [27140/54936]
loss: 0.631544  [32568/54936]
loss: 0.615854  [37996/54936]
loss: 0.604336  [43424/54936]
loss: 0.598106  [48852/54936]
loss: 0.596911  [54280/54936]


  0%|          | 0/598 [00:00<?, ?it/s]

loss: 0.601430  [    0/54936]
loss: 0.596676  [ 5428/54936]
loss: 0.619320  [10856/54936]
loss: 0.575719  [16284/54936]
loss: 0.624625  [21712/54936]
loss: 0.571945  [27140/54936]
loss: 0.587323  [32568/54936]
loss: 0.537397  [37996/54936]
loss: 0.518878  [43424/54936]
loss: 0.565910  [48852/54936]
loss: 0.546594  [54280/54936]


  0%|          | 0/598 [00:00<?, ?it/s]

loss: 0.522158  [    0/54936]
loss: 0.504827  [ 5428/54936]
loss: 0.583590  [10856/54936]
loss: 0.526870  [16284/54936]
loss: 0.526867  [21712/54936]
loss: 0.506098  [27140/54936]
loss: 0.450357  [32568/54936]
loss: 0.444920  [37996/54936]
loss: 0.509745  [43424/54936]
loss: 0.424810  [48852/54936]
loss: 0.433962  [54280/54936]


In [12]:
# Save model
torch.save(model.state_dict(), 'feature_rep.pth')

In [13]:
# check loss
loss.item()

0.36332187056541443

The feature representation $\Phi$ to be extracted (see report) is the saved model with the final linear layer removed