In [5]:
%pip install seaborn
%pip install transformers 
%pip install matplotlib pandas
%pip install protobuf==3.20
%pip install torch
%pip install datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting datasets
  Using cached datasets-2.15.0-py3-none-any.whl (521 kB)
Collecting dill<0.3.8,>=0.3.0
  Using cached dill-0.3.7-py3-none-any.whl (115 kB)
Collecting fsspec[http]<=2023.10.0,>=2023.1.0
  Using cached fsspec-2023.10.0-py3-none-any.whl (166 kB)
Collecting aiohttp
  Downloading aiohttp-3.9.1-cp310-cp310-macosx_11_0_arm64.whl (386 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.5/386.5 kB[0m [31m864.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pyarrow-hotfix
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting xxhash
  Downloading xxhash-3.4.1-cp310-cp310-macosx_11_0_arm64.whl (30 kB)
Colle

In [30]:
import torch
import seaborn as sns
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset
dataset = load_dataset('bigscience/P3', 'cos_e_v1.11_aligned_with_common_sense')
train_dataset = dataset['train']

# Initialize the tokenizer and models (one or continuous prompting and other for projected prompting
model_continuous = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')
model_projected = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')

tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_continuous.to(device)
model_projected.to(device)

# Define the prompt basis
prompt_list = [
    'When you see the following question, I would like you to answer it correctly',
    'Produce an executable artifact of type X that will answer the question, and then execute it',
    'When I ask you a question, generate three additional questions that would help you give a more accurate answer. When you then answered the three questions, combine the answers to produce the final answers to my original question  ',
    'Generate a set of facts that are contained in the output. The set of facts should be inserted in a specific point in the output to answer the question',
]

print(f'tokenizing')

basis = tokenizer(prompt_list, padding=True, truncation=True, return_tensors='pt').to(device)
basis = model_projected.model.shared(basis.input_ids)

# print(f'prompt basis shape: {basis.shape}')

import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the weight prediction model
class LearnWeights(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LearnWeights, self).__init__()
        self.attention = nn.MultiheadAttention(input_dim, hidden_dim)
        self.fc = nn.Linear(input_dim, output_dim)  # Change this line

    def forward(self, x):
        x, _ = self.attention(x, x, x)
        x = self.fc(x)
        return F.softmax(x, dim=-1).unsqueeze(-1).repeat(1, 1, 1024)

# Define the soft prompt
# we want good initialization of the soft prompt
soft_prompt = torch.mean(basis, dim=0)
soft_prompt = torch.nn.Parameter(soft_prompt)
optimizer_continuous = AdamW([soft_prompt])

# Define the projected prompt
input_dim = 1
hidden_dim = 1
output_dim = len(prompt_list)
learn_weights = LearnWeights(input_dim, hidden_dim, output_dim).to(device)
optimizer_projected = AdamW(learn_weights.parameters())

# Training parameters
epochs = 1
batch_size = 4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('starting training')

# Training loop
continuous_losses = []
projected_losses = []

for epoch in range(epochs):
    epoch_loss_continuous = 0
    epoch_loss_projected = 0
    for i in range(0, len(train_dataset) - 9000, batch_size):
        batch = train_dataset[i:i+batch_size]
        input_ids = tokenizer(batch['inputs_pretokenized'], return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        labels = tokenizer(batch['targets_pretokenized'], return_tensors='pt', padding=True, truncation=True).input_ids.to(device)

        # Get the prompt input embeddings - same if continuous or projected
        input_embeddings = model_continuous.model.shared(input_ids)
        
        print(f'input embeddings shape: {input_embeddings.shape}')

        # add copies of soft prompt for each element in the batch
        soft_prompt_batch = soft_prompt.unsqueeze(0).repeat(batch_size, 1, 1).to(device)
        
        # print(f'soft prompt shape: {soft_prompt.shape}')
        #the goal is to have good initizlization of the soft prompt
        
        # add copies of projected prompt for each element in the batch
        weights = learn_weights(basis)
        projected_prompt_batch = (weights * basis).sum(dim=0).unsqueeze(0).repeat(batch_size, 1, 1).to(device)
        
        # print(f'shapes of soft batch and input embeddings: {soft_prompt_batch.shape}, {input_embeddings.shape}')
        
        combined_continuous_embeddings = torch.cat([soft_prompt_batch, input_embeddings], dim=1)
        combined_projected_embeddings = torch.cat([projected_prompt_batch, input_embeddings], dim=1)

        # Pass the combined embeddings through the model
        outputs_continuous = model_continuous(inputs_embeds=combined_continuous_embeddings, labels=labels)
        outputs_projected = model_projected(inputs_embeds=combined_projected_embeddings, labels=labels)

        loss_continuous = outputs_continuous.loss
        epoch_loss_continuous += loss_continuous.item()

        loss_projected = outputs_projected.loss
        epoch_loss_projected += loss_projected.item()

        optimizer_continuous.zero_grad()
        loss_continuous.backward(retain_graph=True)
        optimizer_continuous.step()

        optimizer_continuous.zero_grad()
        loss_projected.backward()
        optimizer_continuous.step()

        print(f'\r complete from this epoch {i}/{len(train_dataset)}', end='')
        print(f'\r loss continuous: {loss_continuous.item()}', end='')
        print(f'\r loss projected: {loss_projected.item()}', end='')

    epoch_loss_continuous /= len(train_dataset)
    epoch_loss_projected /= len(train_dataset)

    continuous_losses.append(epoch_loss_continuous)
    projected_losses.append(epoch_loss_projected)

    print(f'\r Epoch {epoch+1}/{epochs} complete. Loss: {epoch_loss_continuous}')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Create a DataFrame with the loss values
data = {
    'Epoch': list(range(1, epochs + 1)) * 2,
    'Loss': continuous_losses + projected_losses,
    'Model': ['Continuous'] * epochs + ['Projected'] * epochs
}
df = pd.DataFrame(data)

# Create the plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='Epoch', y='Loss', hue='Model')
plt.title('Loss per Epoch for Continuous and Projected Models')
plt.show()

tokenizing
starting training
input embeddings shape: torch.Size([4, 82, 1024])


AssertionError: was expecting embedding dimension of 1, but got 1024