In [1]:
import torch
import seaborn as sns
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the dataset
dataset = load_dataset('bigscience/P3', 'cos_e_v1.11_aligned_with_common_sense')
train_dataset = dataset['train']

# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')
model_continuos = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')
model_projected = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_continuos.to(device)
model_projected.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [8]:
#lets define our prompt basis

#input embeddings shape: torch.Size([8, 82, 1024])
# combined embeddings shape: torch.Size([8, 102, 1024])
# outputs shape: torch.Size([8, 26, 50264])
# labels shape: torch.Size([8, 26])
# labels pretokenized: ['\nwebmath is designed to help you solve', '\nbums are well known to take up residence under bridges.', '\nthis word is most relavant', '\nst.paul is a county in minnesota', '\nif you need speed, corvette is the answer.', '\nwashington is the only place in the list that has pacific beaches', '\na big fountain was the center piece of the renovation, it had all been paid for by a grant to the city', '\nbeing ambitious means they will work hard to be good.']

prompt_list = [
    'When you see the following question, I would like you to answer it correctly', # ~13 tokens 
    'Produce an executable artifact of type X that will answer the question, and then execute it',
    'When I ask you a question, generate three additional questions that would help you give a more accurate answer. When you then answered the three questions, combine the answers to produce the final answers to my original question',
    'Generate a set of facts that are contained in the output. The set of facts should be inserted in a specific point in the output to answer the question',
]

basis = tokenizer(prompt_list, padding=True, truncation=True, return_tensors='pt').to(device)

print(f'prompt basis shape: {model_projected.model.shared(basis.input_ids).shape}')

prompt basis shape: torch.Size([4, 43, 1024])


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
# Define the FFNN architecture
class LearnWeights(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LearnWeights, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=-1)

In [14]:
# Define the soft prompt
L = 20
d = model_projected.model.shared.embedding_dim
soft_prompt = torch.randn(L, d).to(device)
soft_prompt = torch.nn.Parameter(soft_prompt)
optimizer_continuous = AdamW([soft_prompt])

#Define the projected prompt
input_dim = d  
hidden_dim = 64
output_dim = len(prompt_list)
learn_weights = LearnWeights(input_dim, hidden_dim, output_dim).to(device)
optimizer_projected = AdamW(learn_weights.parameters())

# Training parameters
epochs = 1
batch_size = 4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('starting training')

# Training loop
continuous_losses = []
projected_losses = []

for epoch in range(epochs):
    epoch_loss_continuous = 0
    epoch_loss_projected = 0
    for i in range(0, len(train_dataset) - 9000, batch_size):
        batch = train_dataset[i:i+batch_size]
        input_ids = tokenizer(batch['inputs_pretokenized'], return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        labels = tokenizer(batch['targets_pretokenized'], return_tensors='pt', padding=True, truncation=True).input_ids.to(device)

        # Get the input embeddings
        continuous_input_embeddings = model_continuos.model.shared(input_ids)
        projected_input_embeddings = model_projected.model.shared(input_ids)

        soft_prompt_batch = soft_prompt.unsqueeze(0).repeat(combined_soft_embeddings.size(0), 1, 1).to(device)
        projected_prompt_batch = learn_weights(soft_prompt_batch)
        
        combined_soft_embeddings = torch.cat([soft_prompt_batch, combined_soft_embeddings], dim=1)
        combined_projected_embeddings = torch.cat([projected_prompt_batch, combined_projected_embeddings], dim=1)

        # Pass the combined embeddings through the model
        outputs_continous = model_continuos(inputs_embeds=combined_soft_embeddings, labels=labels)
        outputs_projected = model_projected(inputs_embeds=combined_projected_embeddings, labels=labels)
        # print(f'input embeddings shape: {input_embeddings.shape}')
        # print(f'combined embeddings shape: {combined_embeddings.shape}')
        # print(f'outputs shape: {outputs.logits.shape}')
        # print(f'labels shape: {labels.shape}')
        # # print(f'labels: {labels}')
        # print(f'labels pretokenized: {batch["targets_pretokenized"]}')
        loss_continuous = outputs_continous.loss
        epoch_loss_continuous += loss_continuous.item()
        
        loss_projected = outputs_projected.loss
        epoch_loss_projected += loss_projected.item()
        
        loss_continuous.backward()
        loss_projected.backward()
        
        optimizer_projected.step()
        optimizer_projected.zero_grad()
        print(f'\r complete from this epoch {i}/{len(train_dataset)}', end='')
        print(f'\r loss continous: {loss_continuous.item()}', end='')
        print(f'\r loss projected: {loss_projected.item()}', end='')


    epoch_loss_continuous /= len(train_dataset)
    epoch_loss_projected /= len(train_dataset)
    
    continuous_losses.append(epoch_loss_continuous)
    projected_losses.append(epoch_loss_projected)
    
    print(f'\r Epoch {epoch+1}/{epochs} complete. Loss: {epoch_loss_continuous}')



starting training


NameError: name 'combined_soft_embeddings' is not defined

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Create a DataFrame with the loss values
data = {
    'Epoch': list(range(1, epochs + 1)) * 2,
    'Loss': continuous_losses + projected_losses,
    'Model': ['Continuous'] * epochs + ['Projected'] * epochs
}
df = pd.DataFrame(data)

# Create the plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='Epoch', y='Loss', hue='Model')
plt.title('Loss per Epoch for Continuous and Projected Models')
plt.show()