## GIT (GenerativeImage2Text) image captioning

### prepare images

In [18]:
from PIL import Image
import requests
from io import BytesIO


def prepare_image(url, timeout=10):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The downloaded image, opened with PIL and converted to "RGB" mode.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: For connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of passing an error body on to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")


# YouTube "hqdefault" thumbnail URLs used as captioning inputs.
url_list = [
    "https://img.youtube.com/vi/avlOGya53IU/hqdefault.jpg",
    "https://img.youtube.com/vi/WmPZgAfhaqg/hqdefault.jpg",
    "https://img.youtube.com/vi/ntaO3-n-isc/hqdefault.jpg",
    "https://img.youtube.com/vi/rR3qkkFNw8k/hqdefault.jpg",
    "https://img.youtube.com/vi/ZQzJoL-LTbM/hqdefault.jpg",
    "https://img.youtube.com/vi/2OSrvzNW9FE/hqdefault.jpg",
    "https://img.youtube.com/vi/60RFIF9y8fY/hqdefault.jpg",
    "https://img.youtube.com/vi/5abffC-K40c/hqdefault.jpg",
    "https://img.youtube.com/vi/U8_tbETGuTs/hqdefault.jpg",
    "https://img.youtube.com/vi/eGHezLPpL3Y/hqdefault.jpg",
]

# Download every thumbnail up front, preserving the order of url_list.
images = list(map(prepare_image, url_list))

### select a model and generate captions

In [2]:
from transformers import AutoProcessor, AutoModelForCausalLM
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# Checkpoint used for captioning. Other checkpoints kept for reference:
#   'microsoft/git-large-textcaps'
#   'microsoft/git-base-textcaps'
#   'microsoft/git-large-coco'
model_name = "microsoft/git-base-coco"

In [45]:

def generate_caption_in_batch(image, processor, device, model, max_length=50):
    """Generate captions for the given image input.

    Args:
        image: Image input forwarded unchanged to ``processor(images=...)``.
            Presumably a single PIL image or a list of them; confirm against
            the processor in use.
        processor: Callable preprocessor that returns an object with a
            ``pixel_values`` attribute and also provides ``batch_decode``.
        device: Device the pixel values are moved to before generation.
        model: Object exposing ``generate(pixel_values=..., max_length=...)``.
        max_length: Value passed to ``model.generate`` as ``max_length``.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        The result of ``processor.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values.to(device)
    generated_ids = model.generate(pixel_values=pixel_values, max_length=max_length)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)

def main(model_name, images, batch_size=1):
    """Load a captioning checkpoint and caption every image.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the processor and the model.
        images: Sequence of images to caption (must support ``len`` and
            slicing).
        batch_size: Number of images sent through the model per call.
            The default of 1 keeps the one-image-at-a-time behaviour; larger
            values hand real batches to ``generate_caption_in_batch``.

    Returns:
        A list of captions in the same order as ``images``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print(f"num images: {len(images)}")
    print(f"model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    captions = []
    # Walk the input in chunks; each chunk is captioned with a single call.
    for start in range(0, len(images), batch_size):
        batch = images[start:start + batch_size]
        captions.extend(generate_caption_in_batch(batch, processor, device, model))
    return captions

In [46]:
# Caption every downloaded image with the selected checkpoint, then display
# the resulting list as the cell output.
captions = main(model_name=model_name, images=images)
captions

num images: 10
model: microsoft/git-base-coco


['a man playing the piano in an airport.',
 'a cartoon of a blue character with a funny face on a blackboard.',
 'a man using a tablet with a tablet',
 'video game developer has released a new video game for the first time',
 'a man holding a flag and a plate of food.',
 'a person throwing a basketball into a pond.',
 'a picture of a sign with a flag and a man in a hat.',
 'a man with glasses and a beard is looking at a computer screen.',
 'a picture of a man in a leather jacket and a leather jacket.',
 'a cat and a dog are playing together.']

In [15]:
# Load the processor and model at notebook level (outside main) so they can
# be inspected and reused interactively by later cells.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

# Trailing expression: the cell displays the module returned by .to().
model.to(device)


GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (image_encoder): GitVisionModel(
      (vision_model): GitVisionTransformer(
        (embeddings): GitVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
          (position_embedding): Embedding(257, 1024)
        )
        (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (encoder): GitVisionEncoder(
          (layers): ModuleList(
            (0): GitVisionEncoderLayer(
              (self_attn): GitVisionAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              

In [16]:

# No cell in this notebook assigns `image`, so bind it explicitly to the first
# downloaded image; otherwise this cell raises NameError on a fresh kernel.
image = images[0]

inputs = processor(images=image, return_tensors="pt")
pixel_values = inputs.pixel_values.to(device)

generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))

Generated caption: ['a man playing a piano in a train station']
