In [2]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import torch

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt").to(device)

In [4]:
input_text = "The quick brown fox jumps over the lazy dog"

input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

print(input_ids.shape)

output = model.generate(input_ids, max_length=100, num_return_sequences=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7, do_sample=True, pad_token_id=tokenizer.eos_token_id)

print(output.shape)

for i, sample_output in enumerate(output):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

torch.Size([1, 9])
torch.Size([5, 100])
0: the quick brown fox jumps over the lazy dog and into the basket. the dog begins to bark, and the fox begins jumping. " now, now... it's just a game. i'm not going to hurt you, little girl. you're not a threat. look at the big cat. he's probably just trying to show you how tough he is. it won't hurt, you 'll see. just get in the car. and don't worry about the cat, okay? it will
1: the quick brown fox jumps over the lazy dog. 
 " what's the matter? " i ask, grabbing the towel and wiping my face. " didn't you like it? i'm sorry. i did a good job. it's just that - " 
 he walks to the kitchen table, and i follow, not sure what else to say. he's going to talk about it, i can tell. but he doesn't. instead, he takes a seat at the table and picks up
2: the quick brown fox jumps over the lazy dog, landing on his back and his front paws, growling, and then he growls, " what the fuck? " 
 the dog jumps off the bed, knocking the chair over, as the other do

In [1]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
import torch

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

In [3]:
input_text = "The quick brown fox jumps over the lazy dog"

input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

print(input_ids.shape)

output = model.generate(input_ids, max_length=100, num_return_sequences=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7, do_sample=True, pad_token_id=tokenizer.eos_token_id)

print(output.shape)

for i, sample_output in enumerate(output):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    

torch.Size([1, 9])
torch.Size([5, 100])
0: The quick brown fox jumps over the lazy dog and then leaps into the river to drown it.

I am a dog-hater and I am so proud of it!
The dog is so adorable! I love the way the dog treats and how the other dog's arms are attached. It's adorable, I just love it so much.
1: The quick brown fox jumps over the lazy dog, as the fox tries to keep up with his owner.

“It looks like a great fox,‼†‏”
—‭The fox is now taking over for a new home. ~~
It is a cute fox with a large coat and a long, long tail, with short tail. It looks really nice, and is adorable. As the dog moves, he and the other dogs move quickly.
2: The quick brown fox jumps over the lazy dog, then they fall off the ground, and then the dog falls back into the pit.

The dog has died. He will be buried in the park in a private cemetery.
3: The quick brown fox jumps over the lazy dog with a flick of his chin, then a bright red fox looks up and walks up to the dog.

The dog is sitting on the g

In [4]:
from model import GPT2WithImageEmbeddings

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = GPT2Config.from_pretrained("distilgpt2", add_cross_attention=True)
model = GPT2WithImageEmbeddings.from_pretrained("distilgpt2", config=config).to(device)

Some weights of GPT2WithImageEmbeddings were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.2.crossattention.c_attn.bias', 'transformer.h.2.crossattention.c_attn.weight', 'transformer.h.2.crossattention.c_proj.bias', 'transforme

In [8]:
input_text = ["The quick brown fox jumps over the lazy dog", "The quick brown fox jumps over the lazy dog"]

input_ids = tokenizer(input_text, return_tensors="pt", padding="max_length", max_length=1024)["input_ids"].to(device)

print(input_ids.shape)

image_embeddings = torch.randn(2, 3600, 768).to(device)

print(image_embeddings.shape)

model.train()
output = model(input_ids, image_embeddings=image_embeddings, labels=input_ids)

print(output.loss)
print(output.logits.shape)


torch.Size([2, 1024])
torch.Size([2, 3600, 768])
tensor(8.5497, device='cuda:0', grad_fn=<NllLossBackward0>)
torch.Size([2, 1024, 50257])


In [16]:
output

CausalLMOutputWithCrossAttentions(loss=tensor(8.6937, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[-31.6941, -29.4798, -31.2177,  ..., -42.1512, -42.1393, -31.2940],
         [-45.3585, -46.4409, -50.1548,  ..., -53.8235, -52.9681, -47.9801],
         [-47.8118, -49.2812, -53.6779,  ..., -58.7576, -55.6396, -52.5801],
         ...,
         [-31.1591, -27.7065, -30.1943,  ..., -43.5227, -43.5939, -30.3264],
         [-31.0014, -27.5555, -30.0292,  ..., -43.3821, -43.4517, -30.1738],
         [-31.7184, -28.1797, -30.7296,  ..., -43.9168, -43.9317, -31.0208]],

        [[-31.6954, -29.4826, -31.2204,  ..., -42.1536, -42.1420, -31.2960],
         [-45.3967, -46.4886, -50.2013,  ..., -53.8658, -52.9994, -48.0197],
         [-47.7919, -49.2751, -53.6651,  ..., -58.7349, -55.6187, -52.5559],
         ...,
         [-31.1632, -27.7157, -30.2012,  ..., -43.5266, -43.5968, -30.3331],
         [-31.0053, -27.5645, -30.0359,  ..., -43.3859, -43.4546, -30.1804],
         [-31.72

In [15]:
# process output to get text

output_ids = torch.argmax(output.logits, dim=-1)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

 The andiees out the fence fox and


In [6]:
input_text = ["The quick brown fox jumps over the lazy dog"]

input_ids = tokenizer(input_text, return_tensors="pt", padding="max_length", max_length=1024)["input_ids"].to(device)

print(input_ids.shape)

image_embeddings = torch.randn(1, 10, 768).to(device)

print(image_embeddings.shape)

model.train()
text_embeddings = model.transformer.wte(input_ids)
print(text_embeddings.shape)

# output = model(, labels=input_ids)

# print(output.loss)
# print(output.logits.shape)

torch.Size([1, 1024])
torch.Size([1, 10, 768])
torch.Size([1, 1024, 768])
