<a href="https://colab.research.google.com/github/rb58853/images_RIS-ML-Conv-NLP/blob/main/end_model/caption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers



## Import libraries

In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2

### Load Image

In [3]:
# Path of the input picture inside the Colab runtime.
img_url = '/content/2.jpg'

# PIL view of the picture, forced to 3-channel RGB.
raw_image = Image.open(img_url).convert("RGB")

# (H, W, 3) uint8 RGB array of the *same* decoded pixels.
# Previously the file was decoded a second time with cv2.imread, which honours
# the EXIF orientation tag while Image.open does not, so the two views could
# end up with different shapes and the SAM masks would no longer line up with
# raw_image. Deriving the array from raw_image keeps them consistent.
image = np.array(raw_image)

# Load all Models

## Segment Anything Model

In [4]:
import torchvision
print("PyTorch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print("CUDA is available:", torch.cuda.is_available())
import sys
!{sys.executable} -m pip install opencv-python matplotlib
!{sys.executable} -m pip install 'git+https://github.com/facebookresearch/segment-anything.git'
!wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth


import sys
sys.path.append("..")
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor

sam_checkpoint = "sam_vit_h_4b8939.pth"
model_type = "vit_h"

device = "cuda"

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)

mask_generator = SamAutomaticMaskGenerator(sam)

PyTorch version: 2.1.0+cu118
Torchvision version: 0.16.0+cu118
CUDA is available: True
Collecting git+https://github.com/facebookresearch/segment-anything.git
  Cloning https://github.com/facebookresearch/segment-anything.git to /tmp/pip-req-build-pn5t7qbm
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/segment-anything.git /tmp/pip-req-build-pn5t7qbm
  Resolved https://github.com/facebookresearch/segment-anything.git to commit 6fdee8f2727f4506cfbbe553e23b895e27956588
  Preparing metadata (setup.py) ... [?25l[?25hdone
--2023-10-26 15:50:14--  https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 99.84.238.181, 99.84.238.206, 99.84.238.162, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|99.84.238.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2564550879 (2.4G) [binary/octet-stream]
Saving to: ‘sam_vit_h_4b8939.pth.3’



#### Create image from mask

In [5]:
from PIL import Image

def mask_image(mask, raw_image):
    """Return an RGBA copy of ``raw_image`` that keeps only the masked pixels.

    Args:
        mask: 2-D array indexed as ``mask[row, col]``; truthy entries mark the
            pixels to keep. Its shape must match the image (height, width).
        raw_image: PIL image to cut the region from (the notebook passes an
            RGB image).

    Returns:
        A new PIL ``RGBA`` image of the same size in which masked pixels carry
        the original colour and every other pixel is fully transparent
        ``(0, 0, 0, 0)``.
    """
    keep = np.asarray(mask, dtype=bool)
    # np.array (not asarray) yields a writable copy, so raw_image is untouched.
    rgba = np.array(raw_image.convert("RGBA"))
    # One vectorised assignment replaces the per-pixel Python double loop.
    rgba[~keep] = 0
    return Image.fromarray(rgba)

def bbox_image(bbox, image):
    """Crop ``image`` to a bounding box.

    Args:
        bbox: sequence whose first four items are ``x``, ``y``, ``width`` and
            ``height`` of the box.
        image: array indexed as ``image[rows, cols]``.

    Returns:
        The slice ``image[y:y + height, x:x + width]``.
    """
    left = bbox[0]
    top = bbox[1]
    width = bbox[2]
    height = bbox[3]
    rows = slice(top, top + height)
    cols = slice(left, left + width)
    return image[rows, cols]

In [6]:
def all_areas_from_image(image, raw_image):
    """Segment ``image`` and cut out every detected region in two ways.

    Args:
        image: array handed to ``mask_generator.generate`` and cropped by
            ``bbox_image``.
        raw_image: PIL image handed to ``mask_image``.

    Returns:
        dict with two parallel lists, one entry per generated mask:
        ``'box'`` holds the bounding-box crops of ``image`` and ``'mask'``
        holds the cut-outs produced by ``mask_image``.
    """
    regions = {'box': [], 'mask': []}
    for segment in mask_generator.generate(image):
        regions['box'].append(bbox_image(segment['bbox'], image))
        regions['mask'].append(mask_image(segment['segmentation'], raw_image))
    return regions

## BLIP

In [7]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# Name the checkpoint once so processor and model cannot drift apart.
blip_checkpoint = "Salesforce/blip-image-captioning-large"
# Use the GPU when there is one; fall back to CPU instead of failing on load.
blip_device = "cuda" if torch.cuda.is_available() else "cpu"

blip_processor = BlipProcessor.from_pretrained(blip_checkpoint)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint).to(blip_device)

def blip(_image):
    """Caption a single image with the module-level BLIP model.

    Args:
        _image: image in a form ``blip_processor`` accepts (the notebook passes
            a PIL image and NumPy RGB crops).

    Returns:
        The caption decoded from the first generated sequence, with special
        tokens removed.
    """
    # Follow the model's own device rather than a hard-coded "cuda", so the
    # inputs always land where the weights are.
    inputs = blip_processor(_image, return_tensors="pt").to(blip_model.device)
    generated_ids = blip_model.generate(**inputs)
    return blip_processor.decode(generated_ids[0], skip_special_tokens=True)

def all_captions(image, raw_image):
    """Build candidate captions for a picture and each of its segments.

    Args:
        image: array passed on to ``all_areas_from_image``.
        raw_image: PIL image; captioned as a whole and passed on to
            ``all_areas_from_image``.

    Returns:
        list of str: the caption of the whole picture first, followed by one
        entry per segment made of that caption, a space, and the caption of
        the segment.
    """
    # Bounding-box crops are captioned here; the 'mask' entry of the same dict
    # holds the cut-out variant of each region.
    regions = all_areas_from_image(image, raw_image)['box']
    origin = str(blip(raw_image))
    return [origin] + [origin + " " + str(blip(region)) for region in regions]

## CLIP

In [8]:
from transformers import CLIPProcessor, CLIPModel

# Device shared by the CLIP model and the inputs built in select_caption.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Processor and weights come from the same checkpoint.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

In [19]:
def select_caption(captions, image):
    """Pick the caption that CLIP scores highest for ``image``.

    Args:
        captions: list of candidate caption strings.
        image: image in a form ``clip_processor`` accepts (the notebook passes
            a PIL image).

    Returns:
        The element of ``captions`` with the highest image-text probability.

    Side effects:
        Prints the probability tensor (one row, one column per caption).
    """
    # truncation=True clips captions that exceed the tokenizer's maximum
    # length instead of letting the text encoder fail on them.
    inputs = clip_processor(
        text=captions,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Pure inference: no autograd graph is needed, so do not build one.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)
    print(probs)
    return select_from_probs(probs, captions)

def select_from_probs(probs, captions):
    """Return the caption whose score in the first row of ``probs`` is largest.

    Args:
        probs: 2-D indexable (tensor or nested list); only ``probs[0]`` is
            read, one score per caption.
        captions: sequence of captions aligned with ``probs[0]``.

    Returns:
        ``captions[i]`` where ``i`` is the position of the largest score. On
        ties the earliest position wins; if ``probs[0]`` is empty the first
        caption is returned.
    """
    best_index = 0
    best_score = None
    for index, score in enumerate(probs[0]):
        # Seed the running maximum from the data, not from 0, so rows whose
        # scores are all <= 0 still yield their true maximum.
        if best_score is None or score > best_score:
            best_score = score
            best_index = index
    return captions[best_index]

# Run Model

In [10]:
# Caption the whole picture and every segmented region.
captions = all_captions(image, raw_image)
# Extra hand-written candidate appended to the generated ones
# (presumably a probe for how CLIP ranks it; confirm intent).
captions += ["there is a cat and a dog laying on a bed together knife"]
print(*captions, sep="\n")



there is a cat and a dog laying on a bed together
there is a cat and a dog laying on a bed together there is a cat that is laying down on a bed
there is a cat and a dog laying on a bed together there is a cat that is sitting on a bed with a pillow
there is a cat and a dog laying on a bed together there is a cat that is sitting on a bed with a blanket
there is a cat and a dog laying on a bed together there is a cat that is sitting in a bowl of food
there is a cat and a dog laying on a bed together there are two cats that are sitting together in a red chair
there is a cat and a dog laying on a bed together a close up of a cat with a blurry background
there is a cat and a dog laying on a bed together blurry image of a cat sitting in a room with a red wall
there is a cat and a dog laying on a bed together there is a close up of a person eating a piece of food
there is a cat and a dog laying on a bed together there is a black cat sitting on a table next to a laptop
there is a cat and a dog 

In [23]:
# Score every candidate caption against the full picture with CLIP and print
# the winner.
# NOTE(review): in the recorded output the manually appended caption gets
# ~0.75 of the probability mass and is the one selected.
result = select_caption(captions, raw_image)
print(result)

tensor([[6.5799e-04, 1.0727e-03, 7.7942e-04, 8.3541e-04, 1.4952e-04, 6.5678e-04,
         9.6113e-03, 4.0556e-03, 4.6645e-03, 6.0604e-05, 1.6062e-02, 3.3200e-02,
         1.0789e-03, 1.8120e-02, 2.9229e-04, 1.6059e-02, 8.3541e-04, 1.6059e-02,
         2.1157e-04, 1.2755e-04, 1.2755e-04, 1.9606e-02, 1.8862e-03, 3.6882e-02,
         9.2014e-04, 8.3541e-04, 2.9504e-03, 1.7574e-04, 1.6400e-03, 7.2747e-03,
         3.1296e-05, 1.5634e-03, 1.6059e-02, 1.4272e-03, 1.3489e-02, 1.3496e-02,
         2.7279e-04, 3.1320e-04, 1.1643e-03, 5.2064e-03, 8.3541e-04, 4.5606e-04,
         3.4663e-04, 7.6761e-04, 1.1672e-04, 7.4757e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
there is a cat and a dog laying on a bed together knife
