In [10]:
import torch
import seaborn as sns
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
# Load the dataset
dataset = load_dataset('bigscience/P3', 'cos_e_v1.11_aligned_with_common_sense')
train_dataset = dataset['train']

# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')
model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [12]:
#Learn the latent space
class Autoencoder(torch.nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(input_dim, latent_dim),
            torch.nn.ReLU(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(latent_dim, input_dim),
            torch.nn.ReLU(),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

    def encode(self, x):
        return self.encoder(x)

In [17]:
latent_dim = 16
d = model.model.shared.embedding_dim
batch_size = 8
epochs = 10 

In [18]:
autoencoder = Autoencoder(d, latent_dim).to(device)
autoencoder_optimizer = AdamW(autoencoder.parameters())

print('Training Autoencoder')
for epoch in range(epochs):
    epoch_loss = 0
    print(f'Epoch {epoch + 1}/{epochs}')
    for i in range(0, len(train_dataset), batch_size):
        batch = train_dataset[i:i+batch_size]
        input_ids = tokenizer(batch['inputs_pretokenized'], return_tensors='pt', padding=True, truncation=True).input_ids
        input_embeddings = model.model.shared(input_ids.to(device))
        latent_representations, reconstructed_embeddings = autoencoder(input_embeddings)
        autoencoder_loss = torch.mean((reconstructed_embeddings - input_embeddings) ** 2)
        print(f'\r complete from this epoch {i}/{len(train_dataset)} with loss {autoencoder_loss} latent shape {latent_representations.shape}', end='')
        epoch_loss += autoencoder_loss.item()
        autoencoder_loss.backward()
        autoencoder_optimizer.step()
        autoencoder_optimizer.zero_grad()
    print(f'Epoch {epoch + 1} Loss: {epoch_loss}')  # print average loss per epoch

Training Autoencoder
Epoch 1/10
 complete from this epoch 9736/9741 with loss 0.005565587896853685 latent shape torch.Size([5, 67, 16])Epoch 1 Loss: 7.068610766436905
Epoch 2/10
 complete from this epoch 9736/9741 with loss 0.005535515956580639 latent shape torch.Size([5, 67, 16])Epoch 2 Loss: 6.445197727298364
Epoch 3/10
 complete from this epoch 9736/9741 with loss 0.005521970335394144 latent shape torch.Size([5, 67, 16])Epoch 3 Loss: 6.422227872302756
Epoch 4/10
 complete from this epoch 9736/9741 with loss 0.005511742550879717 latent shape torch.Size([5, 67, 16])Epoch 4 Loss: 6.408669068943709
Epoch 5/10
 complete from this epoch 9736/9741 with loss 0.005474542733281851 latent shape torch.Size([5, 67, 16])Epoch 5 Loss: 6.3756560299079865
Epoch 6/10
 complete from this epoch 9736/9741 with loss 0.005467498209327459 latent shape torch.Size([5, 67, 16])Epoch 6 Loss: 6.354588527465239
Epoch 7/10
 complete from this epoch 9736/9741 with loss 0.005461387801915407 latent shape torch.Size(

In [19]:
# Select a sample
sample = train_dataset[0]

# Tokenize the sample and get input embeddings
input_ids = tokenizer(sample['inputs_pretokenized'], return_tensors='pt').input_ids
input_embeddings = model.model.shared(input_ids.to(device))

# Pass the input embeddings through the autoencoder
latent_representations, reconstructed_embeddings = autoencoder(input_embeddings)

# Print the input, latent representation, and reconstruction
print(f'Input: {input_embeddings}')
print(f'Latent Representation: {latent_representations}')
print(f'Reconstruction: {reconstructed_embeddings}')


Input: tensor([[[-0.0369,  0.0782,  0.1621,  ...,  0.1831,  0.0589, -0.0659],
         [ 0.0068, -0.0898, -0.0970,  ...,  0.0197, -0.0449,  0.0517],
         [-0.0511,  0.1375,  0.0367,  ...,  0.0351,  0.0063, -0.1656],
         ...,
         [ 0.0224, -0.0013, -0.0172,  ..., -0.0598, -0.0188, -0.0996],
         [-0.0130, -0.0108, -0.0355,  ...,  0.0019, -0.0334,  0.0082],
         [-0.0471,  0.4563, -0.0644,  ...,  0.1069,  0.0339,  0.0493]]],
       device='cuda:0', grad_fn=<EmbeddingBackward0>)
Latent Representation: tensor([[[0.0000e+00, 0.0000e+00, 6.1153e-01,  ..., 1.3335e+00,
          2.9484e+00, 1.0124e+00],
         [2.8248e-01, 1.4828e-02, 4.3686e-01,  ..., 3.9212e-02,
          1.1129e-01, 1.1596e+00],
         [1.2981e+00, 0.0000e+00, 1.2213e+00,  ..., 1.2976e+00,
          1.3611e+00, 9.3695e-01],
         ...,
         [3.4810e-01, 8.0616e-02, 5.8862e-01,  ..., 1.2121e-01,
          5.9638e-01, 5.7909e-01],
         [3.3136e-01, 1.2191e-01, 3.2197e-01,  ..., 7.0938e-04,
