In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
text = "A fantasy styled overhead shot of two zebras running from a cheetah in the African savannah"
prompt = f"""I will provide you with a caption for a photo , image , or painting . Your task is to generate the bounding boxes for the objects mentioned in the caption , along with a background prompt describing the scene . The images are of size 512 x512 , and the bounding boxes should not overlap or go beyond the image boundaries . Each bounding box should be in the format of ( object name , [ top - left x coordinate , top - left y coordinate , box width , box height ]) and include exactly one object . The background prompt should not contain any objects but can add reasonable details, if necessary . Please refer to the example below for the desired format .

Caption : A realistic image of four skiers standi"ng in a line on the snow near a palm tree
Objects : [ ('a skier ', [5 , 152 , 139 , 168]) , ('a skier ', [278 , 192 , 121 , 158]) , ('a skier ', [148 , 173 , 124 , 155]) , ('a palm tree ', [404 , 180 , 103 , 180]) ]
Background prompt : A realistic image of an outdoor scene with snow

Caption : Futuristic painting of a basketball player playing with a football in an indoor gym 
Objects : [ ('a basketball player ', [40 , 152 , 156 , 241]) , ('a football ', [237 , 17 , 54 , 32]) ]
Background prompt : Futuristic painting of an indoor gym

Caption : A pixelized image of a toy practicing his swordsmanship in a children's bedroom 
Objects : [ ('a toy ', [34 , 145 , 120 , 237]) ]
Background prompt : A pixelized image of a children's bedroom

Caption : An oil painting of dog swimming in a blue lake towards a large branch  
Objects : [ ('a dog ', [208 , 201 , 164 , 89]) , ('a large branch ', [156 , 219 , 51 , 29]) ]
Background prompt : An oil painting of a blue lake

Caption : A low light image of a tractor in a field at sunset
Objects : [ ('a tractor ', [246, 298, 165, 133]) ]
Background prompt : A low light image of a field at sunset

Caption : A surreal image of flower blooming in a pot
Objects : [ ('flower ', [215, 162, 123, 163]), ('a pot ', [217, 411, 149, 90]) ]
Background prompt :  A surreal image of the inside a house

Caption : A watercolor painting of a couple of pandas eating bamboo in a forest
Objects : [ ('a panda eating bamboo', [30, 133, 212, 226]), ('a panda eating bamboo', [262, 137, 222, 221]) ]
Background prompt : A watercolor painting of a forest

Caption : {text} """

### Finetuned GPT2-large

In [4]:
%%capture

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("checkpoints_large/checkpoint2.pt/").to(device)
model.eval()

In [12]:
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=64, do_sample=True, temperature=0.7, top_p=0.9, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Objects: [ ('a cat', '[  1, 170, 125, 344]), ('a three dogs', '[  2,  80,  83, 264]')

('a three dogs', '[240,  87, 512, 512]')

('a three dogs', '[


### [MPT-1B-redpajama-200b-dolly](https://huggingface.co/mosaicml/mpt-1b-redpajama-200b-dolly)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-1b-redpajama-200b-dolly", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("mosaicml/mpt-1b-redpajama-200b-dolly", trust_remote_code=True).to(device)



You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


In [5]:
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=64, do_sample=True, temperature=0.7, top_p=0.9, top_k=50, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)


Objects: [ ('a cheetah ', [150, 211, 151, 141]), ('a zebra ', [155, 219, 57, 29]) ]
Background prompt: A fantasy styled overhead shot of two zebras running from a cheetah in the African savannah<|endoftext|>Kansas City Chiefs quarterback


### [RedPajama-INCITE-Instruct-3B-v1](https://huggingface.co/togethercomputer/RedPajama-INCITE-Instruct-3B-v1)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Instruct-3B-v1")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-Instruct-3B-v1", torch_dtype=torch.float16)
model = model.to('cuda:0')

In [10]:
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.8, top_k=100, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



Objects : [ ('a cheetah', [172, 219, 23, 221]), ('a zebra', [207, 202, 167, 182]), ('a zebra', [201, 211, 184, 212]), ('two zebras', [170, 197, 170, 227]) ]
Background prompt : A fantasy styled overhead shot of two zebras running from a cheetah in the African savannah

Caption : A man is sitting on a bench while a group of people are walking by on the street
Objects : [ ('a man', [197, 196, 194, 209]), ('a bench',


### [RedPajama-INCITE-7B-Instruct](https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Instruct)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-7B-Instruct", torch_dtype=torch.float16)
model = model.to('cuda:0')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]
outputs = model.generate(
    **inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.7, top_k=100, return_dict_in_generate=True
)
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



Objects : [ ('a cheetah ', [20, 90, 34, 213]), ('two zebras ', [35, 142, 87, 225]) ]
Background prompt : A fantasy styled overhead shot of two zebras running from a cheetah in the African savannah

Caption : A photo of a person on a skateboard in a skate park 
Objects : [ ('a skateboarder ', [35, 115, 25, 144]) ]
Background prompt : A photo of a person on a skateboard in a skate park

Caption : A woman in a black and white dress
