Connected to ContextEngineering (Python 3.11.14)

In [7]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Build mean-pooled sentence embeddings for a small batch of captions using
# the final hidden states of the pretrained GPT-2 base model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# The stock GPT-2 tokenizer has no pad token configured; reuse EOS so that
# captions of different lengths can be padded into a single batch.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2Model.from_pretrained('gpt2')
model.eval()  # inference only: make sure dropout stays disabled

captions = [
    "example caption",
    "example bird",
    "the bird is yellow has red wings",
    "hi",
    "very good"
]

# Tokenize and pad every caption to the length of the longest one in the batch
encoded_captions = tokenizer(
    captions,
    return_tensors='pt',
    padding=True,
    truncation=True
)
input_ids = encoded_captions['input_ids']
attention_mask = encoded_captions['attention_mask']  # 1 = real token, 0 = padding

# Forward pass to get embeddings. The attention mask is passed explicitly so
# padding positions are excluded from self-attention regardless of which side
# the tokenizer pads on, instead of relying on the causal mask alone.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Per-token hidden states from the last layer: (batch, seq_len, hidden_size)
word_embeddings = outputs.last_hidden_state

# Zero out the vectors that sit on padding positions
token_mask = attention_mask.unsqueeze(-1).to(word_embeddings.dtype)
masked_word_embeddings = word_embeddings * token_mask

# Mean pooling over non-padding tokens only. The count is clamped to at least
# 1 so a caption that tokenizes to zero tokens yields a zero vector rather
# than NaN from a 0/0 division.
token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
sentence_embeddings = masked_word_embeddings.sum(dim=1) / token_counts.to(word_embeddings.dtype)

In [8]:
# Show the pooled embedding tensor (one row per caption) after its label.
print("Sentence Embeddings:", sentence_embeddings, sep=" ", end="\n")

Sentence Embeddings: tensor([[-0.2735, -0.0912, -0.4653,  ..., -0.2039,  0.0174, -0.1937],
        [-0.0682,  0.1680, -0.6740,  ...,  0.0546, -0.0863, -0.1632],
        [-0.1225,  0.3146, -1.3164,  ...,  0.1130, -0.0947, -0.1404],
        [-0.0306, -0.1695, -0.2591,  ..., -0.1182, -0.0145, -0.0746],
        [ 0.1904, -0.3096, -0.2606,  ...,  0.0173, -0.0888,  0.0728]])
