<a href="https://colab.research.google.com/github/rohrl/llm_shenanigans/blob/main/soft_prompts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# based on https://github.com/kipgparker/soft-prompt-tuning/blob/main/example.ipynb

In [10]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [11]:
import torch
import torch.nn as nn

In [12]:
class SoftEmbedding(nn.Module):
    def __init__(self,
                wte: nn.Embedding,
                n_tokens: int = 10,
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        """appends learned embedding to

        Args:
            wte (nn.Embedding): original transformer word embedding
            n_tokens (int, optional): number of tokens for task. Defaults to 10.
            random_range (float, optional): range to init embedding (if not initialize from vocab). Defaults to 0.5.
            initialize_from_vocab (bool, optional): initalizes from default vocab. Defaults to True.
        """
        super(SoftEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens,
                                                                               random_range,
                                                                               initialize_from_vocab))

    def initialize_embedding(self,
                             wte: nn.Embedding,
                             n_tokens: int = 10,
                             random_range: float = 0.5,
                             initialize_from_vocab: bool = True):
        """initializes learned embedding

        Args:
            same as __init__

        Returns:
            torch.float: initialized using original schemes
        """
        if initialize_from_vocab:
            # this takes first n_tokens words from vocab and uses as init of learnt embeddings
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(n_tokens, wte.weight.size(1)).uniform_(-random_range, random_range)

    def forward(self, tokens):
        """run forward pass

        Args:
            tokens (torch.long): input tokens before encoding

        Returns:
            torch.float: encoding of text concatenated with learned task specifc embedding
        """
        # below line means that first n_tokens tokens will be ignored (?)
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [13]:
n_tokens = 3 # 20
initialize_from_vocab = False  # True

In [14]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [31]:
model.get_input_embeddings()

Embedding(50257, 768)

In [32]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [33]:
s_wte = SoftEmbedding(model.get_input_embeddings(),
                      n_tokens=n_tokens,
                      initialize_from_vocab=initialize_from_vocab)

In [34]:
s_wte

SoftEmbedding(
  (wte): Embedding(50257, 768)
)

In [35]:
model.set_input_embeddings(s_wte)

In [76]:
inputs = tokenizer("Lorem", return_tensors="pt")


In [77]:
inputs

{'input_ids': tensor([[   43, 29625]]), 'attention_mask': tensor([[1, 1]])}

In [78]:
tokenizer.decode(inputs.input_ids.squeeze(), skip_special_tokens=False)

'Lorem'

In [79]:

# need to pad attention_mask and input_ids to be full seq_len + n_learned_tokens
# even though it does not matter what you pad input_ids with, it's just to make HF happy
# more exp: the SoftEmbedding implementation ignores first n_tokens of input tokens so this padding is to insert them at the beginning (and also make consistent with attention_mask length)
# Padding is made of repeated "unk_token" (but it doesn't matter as it's ignored)
inputs['input_ids'] = torch.cat([torch.full((1,n_tokens), tokenizer.unk_token_id), inputs['input_ids']], 1)
inputs['attention_mask'] = torch.cat([torch.full((1,n_tokens), 1), inputs['attention_mask']], 1)


In [80]:
print(inputs)
print(tokenizer.decode(inputs.input_ids.squeeze(), skip_special_tokens=False))

{'input_ids': tensor([[50256, 50256, 50256,    43, 29625]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
<|endoftext|><|endoftext|><|endoftext|>Lorem


In [81]:
# outputs = model(**inputs)

new_out_tokens = 10
curr_inputs = inputs

new_token_id = 0
outputs = torch.cat([inputs.input_ids, torch.full((1, new_out_tokens), 0) ], 1)


model.eval()
with torch.no_grad():
  for i in range(new_out_tokens):

    # outputs = model.generate(**inputs, max_length = curr_inputs.input_ids.size(1) + 1)
    raw_outputs = model(**curr_inputs)
    # print(raw_outputs.logits.shape)

    # new_token_id = outputs.squeeze()[-1]
    new_token_id = raw_outputs.logits[:,-1,:].argmax(axis=-1).item()
    outputs[:, (-new_out_tokens+i)] = new_token_id
    # print(outputs)

    # add the new token to inputs and repeat
    curr_inputs['input_ids'] = torch.cat([curr_inputs['input_ids'], torch.full((1, 1), new_token_id)], 1)
    curr_inputs['attention_mask'] = torch.cat([curr_inputs['attention_mask'], torch.full((1,1), 1)], 1)



In [82]:
# print(outputs.logits.shape)
print(outputs)

predicted_token_ids = outputs.squeeze()

tensor([[50256, 50256, 50256,    43, 29625,   220,  2419,   388,    13,   198,
           198,   464,   717,   318,   262]])


In [83]:
text = tokenizer.decode(predicted_token_ids, skip_special_tokens=False) #[0]

# Print the decoded text
print(f"|{text}|")

|<|endoftext|><|endoftext|><|endoftext|>Lorem ipsum.

The first is the|


## Training

In [127]:
import torch
import torch.nn as nn
import torch.optim as optim

target = "Lorem ipsum ble."

target_tokens = tokenizer(target, return_tensors="pt")
target_len = target_tokens.input_ids.size(1)

print(target_tokens.input_ids)

# create the batch by repeating tokens, then in the loop mask endings

target_tokens.input_ids = target_tokens.input_ids.repeat(target_len - 1, 1)
target_tokens.attention_mask = target_tokens.attention_mask.repeat(target_len - 1, 1)

# labels will be the next token, so clone and left-shift 1 hop
labels = target_tokens.input_ids.clone()
labels = labels.roll(-1, dims=-1)

# add masks
for i in range(target_len - 1):
  # pad right of i
  #labels.append(target_tokens.input_ids[i, i+1].item())
  target_tokens.input_ids[i, i+1:] = torch.full((1, target_len - i - 1), tokenizer.unk_token_id)
  target_tokens.attention_mask[i, i+1:] = torch.full((1, target_len - i - 1), 0)

# last token will never be fed as input so trim all tensors
target_tokens.input_ids = target_tokens.input_ids[:, :-1]
target_tokens.attention_mask = target_tokens.attention_mask[:, :-1]
labels = labels[:, :-1]

print(target_tokens.input_ids)
print(target_tokens.attention_mask)
print(labels)


tensor([[   43, 29625,   220,  2419,   388,  7245,    13]])
tensor([[   43, 50256, 50256, 50256, 50256, 50256],
        [   43, 29625, 50256, 50256, 50256, 50256],
        [   43, 29625,   220, 50256, 50256, 50256],
        [   43, 29625,   220,  2419, 50256, 50256],
        [   43, 29625,   220,  2419,   388, 50256],
        [   43, 29625,   220,  2419,   388,  7245]])
tensor([[1, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1]])
tensor([[29625,   220,  2419,   388,  7245,    13],
        [29625,   220,  2419,   388,  7245,    13],
        [29625,   220,  2419,   388,  7245,    13],
        [29625,   220,  2419,   388,  7245,    13],
        [29625,   220,  2419,   388,  7245,    13],
        [29625,   220,  2419,   388,  7245,    13]])


In [88]:
# freeze entire model, then unfreeze soft embeddings
model.requires_grad_(False)
s_wte.requires_grad_(True)

SoftEmbedding(
  (wte): Embedding(50257, 768)
)

In [95]:


model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
for epoch in range(100):
    # Forward pass
    outputs = model(input_ids=input_ids, attention_mask=attn_masks)

    # Calculate the loss

    # TODO: ignore padding in loss
    loss = criterion(outputs.logits[:, -1, :], labels)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss
    if epoch % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, 100, loss.item()))


ValueError: Expected input batch_size (3) to match target batch_size (2).