<a href="https://colab.research.google.com/github/rohrl/llm_shenanigans/blob/main/soft_prompts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# based on https://github.com/kipgparker/soft-prompt-tuning/blob/main/example.ipynb

In [1]:
!pip install sentencepiece transformers accelerate einops



In [2]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, AutoModelForCausalLM, AutoTokenizer

In [3]:
import torch
import torch.nn as nn

In [4]:
torch.set_default_device('cuda')

In [5]:


# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained('gpt2', device_map="cuda")

# model_id = "itsliupeng/llama2_7b_mmlu"
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda")
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", device_map="cuda", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
model.device

device(type='cuda', index=0)

## Sanity check

In [7]:
def create_empty_input():
    return {'input_ids': torch.empty(size=(1,0)).to(torch.int64), 'attention_mask': torch.empty(size=(1,0)).to(torch.int64)}

In [8]:
sanity_text = "The capital of Australia is"
sanity_output = model.generate(input_ids = tokenizer.encode(sanity_text, return_tensors="pt"), max_length=15, num_return_sequences=1)
print("*******************************************\n" +
      tokenizer.decode(sanity_output[0], skip_special_tokens=True) +
      "\n*******************************************\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


*******************************************
The capital of Australia is Canberra.
    How many more tickets should Is
*******************************************



In [9]:
# inputs = create_empty_input()
# model.generate(inputs['input_ids'], max_length=10, num_return_sequences=1)
# print("==================\n" + tokenizer.decode(sanity_output[0], skip_special_tokens=True) + "\n==================\n")

## Soft Embeddings

In [10]:
class SoftEmbedding(nn.Module):
    def __init__(self,
                wte: nn.Embedding,
                n_tokens: int = 10,
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        """appends learned embedding to

        Args:
            wte (nn.Embedding): original transformer word embedding
            n_tokens (int, optional): number of tokens for task. Defaults to 10.
            random_range (float, optional): range to init embedding (if not initialize from vocab). Defaults to 0.5.
            initialize_from_vocab (bool, optional): initalizes from default vocab. Defaults to True.
        """
        super(SoftEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens,
                                                                               random_range,
                                                                               initialize_from_vocab))

    def initialize_embedding(self,
                             wte: nn.Embedding,
                             n_tokens: int = 10,
                             random_range: float = 0.5,
                             initialize_from_vocab: bool = True):
        """initializes learned embedding

        Args:
            same as __init__

        Returns:
            torch.float: initialized using original schemes
        """
        if initialize_from_vocab:
            # this takes first n_tokens words from vocab and uses as init of learnt embeddings
            return self.wte.weight[:n_tokens].clone().detach()
        # .half() is needed for Phi2
        return torch.FloatTensor(n_tokens, wte.weight.size(1)).uniform_(-random_range, random_range).half().to('cuda')

    def forward(self, tokens):
        """run forward pass

        Args:
            tokens (torch.long): input tokens before encoding

        Returns:
            torch.float: encoding of text concatenated with learned task specifc embedding
        """
        # below line means that first n_tokens tokens will be ignored (?)
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [43]:
# How many soft prompt tokens do we want to use.
num_soft_prompt_tokens = 20
initialize_from_vocab = False  # True

In [44]:
model.get_input_embeddings()

SoftEmbedding(
  (wte): Embedding(51200, 2560)
)

In [45]:
tokenizer

CodeGenTokenizerFast(name_or_path='microsoft/phi-2', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50257: AddedToken("                               ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("                              ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("                             ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("                            ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("            

In [46]:
s_wte = SoftEmbedding(model.get_input_embeddings(),
                      n_tokens = num_soft_prompt_tokens,
                      initialize_from_vocab = initialize_from_vocab)

AttributeError: 'SoftEmbedding' object has no attribute 'weight'

In [15]:
s_wte

SoftEmbedding(
  (wte): Embedding(51200, 2560)
)

In [16]:
model.set_input_embeddings(s_wte)

In [17]:
def prepend_with_soft_prompts_padding(inputs, num_soft_tokens, pad_token_id = tokenizer.unk_token_id, labels = None):
    """
    Need to pad attention_mask and input_ids to be full seq_len + n_learned_tokens,
    even though it does not matter what you pad input_ids with, it's just to make HF happy.
    More exp: the SoftEmbedding implementation ignores first num_soft_prompt_tokens of input tokens so this padding is to insert them at the beginning (and also make consistent with attention_mask length)
    Padding is made of repeated "unk_token" (but it doesn't matter as it's ignored).
    """
    batch_size = inputs['input_ids'].size(0)

    inputs['input_ids'] = torch.cat([torch.full((batch_size, num_soft_tokens), pad_token_id), inputs['input_ids']], 1)
    inputs['attention_mask'] = torch.cat([torch.full((batch_size, num_soft_tokens), 1), inputs['attention_mask']], 1)

    if labels is None:
        return inputs
    else:
        labels = torch.cat([torch.full((batch_size, num_soft_tokens), pad_token_id), labels], 1)
        return inputs, labels


## Inference

In [34]:
model.eval()

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): SoftEmbedding(
      (wte): Embedding(51200, 2560)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (fi

In [35]:
inputs = tokenizer("The capital of Australia is", return_tensors="pt")
# inputs = create_empty_input()

In [36]:
inputs

{'input_ids': tensor([[ 464, 3139,  286, 4505,  318]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')}

In [37]:
tokenizer.decode(inputs['input_ids'].squeeze(), skip_special_tokens=False)

'The capital of Australia is'

In [38]:
inputs = prepend_with_soft_prompts_padding(inputs, num_soft_prompt_tokens)

print(inputs)
print(tokenizer.decode(inputs['input_ids'].squeeze(), skip_special_tokens=False))

{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           464,  3139,   286,  4505,   318]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')}
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The capital of Australia is


In [39]:
# outputs = model(**inputs)

new_out_tokens = 10
curr_inputs = inputs

new_token_id = 0
outputs = torch.cat([inputs['input_ids'], torch.full((1, new_out_tokens), 0) ], 1)


model.eval()

with torch.no_grad():
  for i in range(new_out_tokens):

    # outputs = model.generate(**inputs, max_length = curr_inputs['input_ids'].size(1) + 1)
    raw_outputs = model(**curr_inputs)
    # print(raw_outputs.logits.shape)

    # new_token_id = outputs.squeeze()[-1]
    new_token_id = raw_outputs.logits[:,-1,:].argmax(axis=-1).item()
    outputs[:, (-new_out_tokens+i)] = new_token_id
    # print(outputs)

    # add the new token to inputs and repeat
    curr_inputs['input_ids'] = torch.cat([curr_inputs['input_ids'], torch.full((1, 1), new_token_id)], 1)
    curr_inputs['attention_mask'] = torch.cat([curr_inputs['attention_mask'], torch.full((1,1), 1)], 1)



In [40]:
# print(outputs.logits.shape)
print(outputs)

predicted_token_ids = outputs.squeeze()

tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           464,  3139,   286,  4505,   318, 43296,    13,   198,    32,  1585,
         23556,   531,    11,   366,   464]], device='cuda:0')


In [41]:
text = tokenizer.decode(predicted_token_ids, skip_special_tokens=True) #[0]

# Print the decoded text
print(f"|{text}|")

|The capital of Australia is Honolulu.
Aristotle said, "The|


## Training

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import copy

# this token is ignored in loss - used to mask remainder of output
ignored_token_id = tokenizer.unk_token_id

target = "The capital of Australia is Honolulu."

target_tokens = tokenizer(target, return_tensors="pt")
target_len = target_tokens['input_ids'].size(1)

print(target_tokens['input_ids'])

# create the batch by repeating tokens, then in the loop mask endings

target_tokens['input_ids'] = target_tokens['input_ids'].repeat(target_len - 1, 1)
target_tokens['attention_mask'] = target_tokens['attention_mask'].repeat(target_len - 1, 1)

# labels will be the next token, so clone and left-shift 1 hop
labels = target_tokens['input_ids'].clone()
labels = labels.roll(-1, dims=-1)

# add masks
for i in range(target_len - 1):
  # pad right of i
  #labels.append(target_tokens['input_ids'][i, i+1].item())
  target_tokens['input_ids'][i, i+1:] = torch.full((1, target_len - i - 1), ignored_token_id)
  labels[i, i+1:] = torch.full((1, target_len - i - 1), ignored_token_id)
  target_tokens['attention_mask'][i, i+1:] = torch.full((1, target_len - i - 1), 0)

# last token will never be fed as input so trim all tensors
target_tokens['input_ids'] = target_tokens['input_ids'][:, :-1]
target_tokens['attention_mask'] = target_tokens['attention_mask'][:, :-1]
labels = labels[:, :-1]

# Finally, pad inputs with soft prompts
target_tokens, labels = prepend_with_soft_prompts_padding(target_tokens, num_soft_prompt_tokens, labels = labels)


print(target_tokens['input_ids'])
print(target_tokens['attention_mask'])
print(labels)


tensor([[  464,  3139,   286,  4505,   318, 43296,    13]], device='cuda:0')
tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           464, 50256, 50256, 50256, 50256, 50256],
        [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           464,  3139, 50256, 50256, 50256, 50256],
        [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           464,  3139,   286, 50256, 50256, 50256],
        [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
           464,  3139,   286,  4505, 50256, 50256],
        [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
    

In [27]:
# freeze entire model, then unfreeze soft embeddings
model.requires_grad_(False)
s_wte.requires_grad_(True)

SoftEmbedding(
  (wte): Embedding(51200, 2560)
)

In [28]:
def compute_loss(criterion, logits, labels):

    logits_flat = logits.view(-1, logits.size(-1))
    # print(logits_flat.shape)

    labels_flat = labels.flatten()
    # print(labels_flat.shape)

    loss = criterion(logits_flat, labels_flat)

    loss_per_batch = loss.mean()

    return loss_per_batch

In [29]:
# Training loop.
# Note we don't need/want an eval set - unlike in typical training, we do want to overfit to train data as much as possible.

model.train()

criterion = nn.CrossEntropyLoss(ignore_index = ignored_token_id, reduction='none')
optimizer = optim.SGD(model.parameters(), lr=0.01) # TODO: Try Adam

best_loss = 1e9
best_soft_prompts = None

# Train the model
num_epochs = 1000

for epoch in range(num_epochs):
    # Forward pass
    outputs = model(input_ids = target_tokens['input_ids'], attention_mask = target_tokens['attention_mask'])

    # print(outputs.logits.shape) #, outputs.logits)
    # print(labels.shape, labels)

    loss = compute_loss(criterion, outputs.logits, labels)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Save best
    if loss.item() < best_loss:
        best_loss = loss.item()
        best_soft_prompts = copy.deepcopy(s_wte) # .clone() fails :(
        print('--- NEW BEST: Epoch: {}, Loss: {:.4f}'.format(epoch+1, best_loss))


    # Print the loss
    if epoch % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))


--- NEW BEST: Epoch: 1, Loss: 0.7091
Epoch [1/1000], Loss: 0.7091
--- NEW BEST: Epoch: 2, Loss: 0.6532
--- NEW BEST: Epoch: 7, Loss: 0.6437
--- NEW BEST: Epoch: 9, Loss: 0.6272
Epoch [11/1000], Loss: 0.6626
--- NEW BEST: Epoch: 17, Loss: 0.6266
Epoch [21/1000], Loss: 0.6369
--- NEW BEST: Epoch: 23, Loss: 0.6184
--- NEW BEST: Epoch: 25, Loss: 0.6179
--- NEW BEST: Epoch: 26, Loss: 0.6011
--- NEW BEST: Epoch: 31, Loss: 0.5834
Epoch [31/1000], Loss: 0.5834
Epoch [41/1000], Loss: 0.5905
--- NEW BEST: Epoch: 42, Loss: 0.5626
--- NEW BEST: Epoch: 51, Loss: 0.5397
Epoch [51/1000], Loss: 0.5397
--- NEW BEST: Epoch: 56, Loss: 0.5113
Epoch [61/1000], Loss: 0.5437
--- NEW BEST: Epoch: 71, Loss: 0.5093
Epoch [71/1000], Loss: 0.5093
--- NEW BEST: Epoch: 72, Loss: 0.5053
--- NEW BEST: Epoch: 78, Loss: 0.5028
--- NEW BEST: Epoch: 80, Loss: 0.4951
Epoch [81/1000], Loss: 0.5114
--- NEW BEST: Epoch: 89, Loss: 0.4835
Epoch [91/1000], Loss: 0.4921
Epoch [101/1000], Loss: 0.5043
--- NEW BEST: Epoch: 102, Lo

In [30]:
best_loss

0.0036146200727671385

In [None]:
best_loss

0.006296176929026842

In [31]:
best_soft_prompts

SoftEmbedding(
  (wte): Embedding(51200, 2560)
)

In [32]:
# Set the best soft prompts on the model.
model.set_input_embeddings(best_soft_prompts)

In [33]:
print("===============================================================================================")
print(">>>>> Now go back to Inference section and see what you get with trained soft prompts =] <<<<<<")
print("===============================================================================================")

>>>>> Now go back to Inference section and see what you get with trained soft prompts =] <<<<<<


In [None]:
# USING HF LIBRARY
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     output_dir="./model_checkpoints",  # Output directory for checkpoints
#     num_train_epochs=3,  # Total number of training epochs
#     per_device_train_batch_size=16,  # Batch size per device
#     per_device_eval_batch_size=16,  # Batch size for evaluation
#     warmup_steps=500,  # Number of warmup steps
#     logging_steps=100,  # Number of steps between logging
#     save_steps=1000,  # Number of steps between saving checkpoints
#     evaluation_strategy="steps",  # Evaluation strategy
#     eval_steps=1000,  # Number of steps between evaluations
# )

# trainer = Trainer(
#     model=model,  # The model to train
#     args=training_args,  # Training arguments
#     train_dataset=train_dataset,  # Training dataset
#     eval_dataset=eval_dataset,  # Evaluation dataset
# )

# trainer.train()