# Colab-BLIP

Original repo: [salesforce/BLIP](https://github.com/salesforce/BLIP)

Official Colab: [here](https://colab.research.google.com/github/salesforce/BLIP/blob/main/demo.ipynb)

My fork: [styler00dollar/Colab-BLIP](https://github.com/styler00dollar/Colab-BLIP)

A small Colab notebook that uses the large BLIP models instead of the base models.

In [None]:
# Print the NVIDIA driver / GPU status of this runtime.
!nvidia-smi

In [None]:
#@title install
%cd /content/
!pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4 wget
!git clone https://github.com/salesforce/BLIP
%cd BLIP

In [None]:
# NOTE: `wget` is already part of the pip command in the install cell; this
# cell only matters when that cell was not run.
!pip install wget

# IMG2TXT

In [None]:
#@title load model `BLIP w/ ViT-L 129M` (image -> text)
%cd /content/BLIP
from models.blip import BLIP_Decoder, load_checkpoint
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import os 
import wget 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_size = 384
transform = transforms.Compose([
    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 

model_path = '/content/BLIP/model_large.pth'
if not os.path.isfile(model_path):
  wget.download("https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth")
    
model = BLIP_Decoder(image_size=384, vit='large')
model, _ = load_checkpoint(model, model_path)
model.eval()
model = model.to(device)

In [None]:
#@title load model `Image Captioning (COCO) BLIP w/ ViT-L` (image -> text)
%cd /content/BLIP
from models.blip import BLIP_Decoder, load_checkpoint
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import wget
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_size = 384
transform = transforms.Compose([
    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 

model_path = '/content/BLIP/model_large_caption.pth'
if not os.path.isfile(model_path):
  wget.download("https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth")

model = BLIP_Decoder(image_size=384, vit='large')
model, _ = load_checkpoint(model, model_path)
model.eval()
model = model.to(device)

In [None]:
#@title inference (image -> text)
%cd /content/BLIP

img_url = "URL" #@param {type:"string"}
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')   

w,h = raw_image.size
display(raw_image.resize((w//5,h//5)))

image = transform(raw_image).unsqueeze(0).to(device)   

with torch.no_grad():
    caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)
    print('caption: '+caption[0])

# VQA

In [None]:
#@title load model `BLIP w/ ViT-B and CapFilt-L VQA` (answering question)
%cd /content/BLIP
from models.blip_vqa import BLIP_VQA, load_checkpoint
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_size = 480
transform = transforms.Compose([
    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 

!wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth
model_path = "/content/BLIP/model*_vqa.pth"

model = BLIP_VQA(image_size=480, vit='large')
model, _ = load_checkpoint(model, model_path)
model.eval()
model = model.to(device)


In [None]:
#@title load model `BLIP w/ ViT-B VQA` (answering question)
%cd /content/BLIP
from models.blip_vqa import BLIP_VQA, load_checkpoint
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_size = 480
transform = transforms.Compose([
    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 

!wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth
model_path = "/content/BLIP/model_vqa.pth"

model = BLIP_VQA(image_size=480, vit='large')
model, _ = load_checkpoint(model, model_path)
model.eval()
model = model.to(device)


In [None]:
#@title inference VQA
# Downloads the image at `img_url`, shows a reduced preview and prints the
# model's answer to `question`. Expects `model`, `transform`, `device`,
# `Image`, `requests` and `torch` from one of the VQA load-model cells.
import io

img_url =  "URL"#@param {type:"string"}
# Fetch the full response body and fail on HTTP errors, so a bad URL raises a
# clear requests exception instead of an opaque image-decoding error. The
# timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(img_url, timeout=30)
response.raise_for_status()
raw_image = Image.open(io.BytesIO(response.content)).convert('RGB')

# Preview at one fifth of the original size; clamp to 1 pixel so very small
# images do not produce a zero-sized resize.
w, h = raw_image.size
display(raw_image.resize((max(1, w // 5), max(1, h // 5))))

# Preprocess and add a leading batch dimension before moving to the device.
image = transform(raw_image).unsqueeze(0).to(device)

question = 'is she happy?' #@param {type:"string"}

# No gradients are needed for answer generation.
with torch.no_grad():
    answer = model(image, question, train=False, inference='generate')
    print('answer: '+answer[0])