In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2Model
from typing import List, Tuple
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources before any tokenization is attempted.
nltk.download('punkt')
nltk.download('punkt_tab')

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained GPT-2 tokenizer; the EOS token doubles as the padding token so
# that batched tokenizer calls using padding=True have a pad token to use.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Pretrained GPT-2 model, moved to the chosen device and put in eval mode.
model = GPT2Model.from_pretrained('gpt2').to(device).eval()

def sentences_to_gpt2_embeddings(sentences: List[str]) -> torch.Tensor:
    """Embed each sentence as the mean of its GPT-2 last-layer token states.

    The sentences are tokenized as one padded (and truncated) batch, run
    through the module-level ``model`` with gradients disabled, and the hidden
    states of the positions selected by the attention mask are averaged.

    Args:
        sentences: Sentences to embed.

    Returns:
        A tensor with one row per input sentence. An empty input list gives an
        empty ``(0, hidden_size)`` tensor. The divisor of the mean is clamped
        to at least 1, so a sentence that produces no tokens does not trigger
        a 0/0 division.
    """
    hidden_size = model.config.hidden_size

    if not sentences:
        # Nothing to tokenize: return an empty batch instead of handing the
        # tokenizer an empty list.
        return torch.empty((0, hidden_size), device=device)

    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

    if inputs['input_ids'].numel() == 0:
        # Every sentence tokenized to nothing, so there are no hidden states
        # to average; report one zero row per sentence.
        return torch.zeros((len(sentences), hidden_size), device=device)

    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        last_hidden_states = model(**inputs).last_hidden_state

    # Zero out the padded positions so they do not contribute to the sum.
    attention_mask = inputs['attention_mask']
    masked_hidden = last_hidden_states * attention_mask.unsqueeze(-1)

    # Number of attended tokens per sentence; clamp so an all-padding row
    # divides by 1 rather than 0.
    token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)

    return masked_hidden.sum(dim=1) / token_counts

def to_vocab_words_and_vectors(sentences: List[str]) -> List[Tuple[List[str], torch.Tensor]]:
    """Split each sentence into GPT-2 vocabulary tokens and embed every token.

    Each sentence is lower-cased and split into words with ``word_tokenize``;
    every word is then encoded on its own (no special tokens) into GPT-2 token
    ids, so a single word may become several sub-word tokens. Each id is run
    through the module-level ``model`` as its own one-token sequence, so a
    token's vector does not depend on the other tokens of the sentence.

    Args:
        sentences: Sentences to process.

    Returns:
        One ``(tokens, vectors)`` pair per sentence. ``tokens`` holds the
        decoded text of each token id and ``vectors`` has exactly one row per
        entry of ``tokens``. A sentence that yields no tokens gives an empty
        token list paired with an empty ``(0, hidden_size)`` tensor.
    """
    results = []
    hidden_size = model.config.hidden_size

    for sentence in sentences:
        token_ids = [
            token_id
            for word in word_tokenize(sentence.lower())
            for token_id in tokenizer.encode(word, add_special_tokens=False)
        ]

        if not token_ids:
            # Keep the empty result two-dimensional and on the same device as
            # the non-empty results.
            results.append(([], torch.empty((0, hidden_size), device=device)))
            continue

        tokens = [tokenizer.decode([token_id]) for token_id in token_ids]

        # Feed the ids themselves as a batch of one-token sequences. Sending
        # the decoded strings back through the tokenizer could split a string
        # into several ids, leaving more vectors than tokens.
        input_ids = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(1)

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=torch.ones_like(input_ids),
            )

        # Drop the length-1 sequence axis: one row per token.
        results.append((tokens, outputs.last_hidden_state.squeeze(1)))

    return results


# Example sentences used to exercise both embedding helpers.
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "A cat sits on the mat",
    "Sunny hills bloom with vivid colors",
]

# Sentence-level embeddings: print the overall shape, then the first five
# components of each sentence's vector.
sentence_embeddings = sentences_to_gpt2_embeddings(sentences)
print(f"Sentence embeddings shape: {sentence_embeddings.shape}")
for sentence, embedding in zip(sentences, sentence_embeddings):
    print(sentence)
    print(f"Embedding: {embedding[:5]}...")

# Token-level embeddings: for each sentence print its tokens, the shape of the
# token-vector tensor, and the first five components of each token's vector.
word_results = to_vocab_words_and_vectors(sentences)
for sentence, (words, word_embeddings) in zip(sentences, word_results):
    print(sentence)
    print(f"Words: {words}")
    print(f"Word embeddings shape: {word_embeddings.shape}")
    for word, emb in zip(words, word_embeddings):
        print(f"Word: {word}, Embedding[:5]: {emb[:5]}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sentence embeddings shape: torch.Size([3, 768])
The quick brown fox jumps over the lazy dog
Embedding: tensor([ 0.0475,  0.1346, -0.4781,  0.1008,  0.0408])...
A cat sits on the mat
Embedding: tensor([-0.4104,  0.0799, -0.7947,  0.2987, -0.1682])...
Sunny hills bloom with vivid colors
Embedding: tensor([-0.1217, -0.0164, -1.2168,  0.1202,  0.0949])...
The quick brown fox jumps over the lazy dog
Words: ['the', 'quick', 'brown', 'fox', 'j', 'umps', 'over', 'the', 'l', 'azy', 'dog']
Word embeddings shape: torch.Size([11, 768])
Word: the, Embedding[:5]: tensor([-0.1116, -0.0376, -0.2625,  0.0089, -0.0062])
Word: quick, Embedding[:5]: tensor([ 0.0089, -0.0820, -0.5968, -0.0222, -0.0429])
Word: brown, Embedding[:5]: tensor([-0.0561,  0.0763, -0.4878, -0.1101, -0.0216])
Word: fox, Embedding[:5]: tensor([-0.2120, -0.0151, -0.3472, -0.0302,  0.0186])
Word: j, Embedding[:5]: tensor([-0.0441, -0.1131, -0.3866,  0.0148,  0.0388])
Word: umps, Embedding[:5]: tensor([-0.2245, -0.0068, -0.3665,  0.006