<a href="https://colab.research.google.com/github/samp3209/capstone/blob/main/Blip_Mass_Output.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# imports and model download


In [None]:
# Colab-only setup: install the pinned dependencies and fetch the BLIP source.
# When the google.colab module is not loaded, this cell does nothing.
import sys
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    # IPython shell escape (not plain Python): install fixed library versions.
    !pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
    # Clone the BLIP repository and make it the working directory --
    # presumably so the later `from models....` imports resolve (verify).
    !git clone https://github.com/salesforce/BLIP
    %cd BLIP

In [None]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
# Pick the compute device: CUDA when torch reports it available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Mount Google Drive at /content/drive; the image folder used by the
# following cells lives under that mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
import os

# Folder on the mounted Drive that holds the images to process.
folder_path = '/content/drive/MyDrive/MassOutput/Test/'

# File extensions treated as images. They are compared against the
# lower-cased file name so that e.g. 'photo.JPG' is not silently skipped.
image_extensions = ('.jpg', '.jpeg', '.png')

# Collect the full path of every image in the folder.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order (and everything derived from it by position) reproducible.
file_paths = [
    os.path.join(folder_path, filename)
    for filename in sorted(os.listdir(folder_path))
    if filename.lower().endswith(image_extensions)
]

# Print the file paths of the images
print(file_paths)

In [None]:
# Open every image file and keep an RGB copy of it in memory, in the same
# order as file_paths.
image_list = []
for path in file_paths:
    # The context manager must wrap the *opened file*, not the converted copy:
    # convert() returns a new in-memory image, so the file handle owned by
    # Image.open is released here while the RGB copy stays usable afterwards.
    with Image.open(path) as img:
        image_list.append(img.convert('RGB'))

In [None]:
def load_images(raw_image, image_size=512):
    """Turn one PIL image into a normalized, batched tensor on `device`.

    Args:
        raw_image: Image accepted by the torchvision transforms below
            (the callers in this notebook pass RGB PIL images).
        image_size: Side length, in pixels, of the square the image is
            resized to. Defaults to 512, the value the model cells in this
            notebook pass as `image_size`.

    Returns:
        The transformed image with a leading dimension of size 1 added
        (via `unsqueeze(0)`), moved to the module-level `device`.
    """
    transform = transforms.Compose([
        # Resize to a fixed square; this does not preserve aspect ratio.
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Fixed per-channel mean / std used for normalization.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return transform(raw_image).unsqueeze(0).to(device)

In [None]:
# Preprocess every loaded image into a model-ready tensor, keeping the order
# of image_list.
inputs = [load_images(pil_image) for pil_image in image_list]

In [None]:
# Generate one caption string per preprocessed image with the BLIP
# captioning model and collect them, in input order, in `captions`.
from models.blip import blip_decoder

image_size = 512
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'

# Build the model once, outside the loop: constructing it (and loading the
# pretrained checkpoint) does not depend on the image being captioned.
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
model.eval()
model = model.to(device)

captions = []
with torch.no_grad():
    for image in inputs:
        # Beam search. For nucleus sampling, pass sample=True, top_p=0.9
        # instead of sample=False, num_beams=3.
        caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)
        print('caption: '+caption[0])
        # generate() returns a sequence; store the caption string itself so
        # it can be concatenated with other strings later.
        captions.append(caption[0])

In [None]:
# Ask the BLIP VQA model a fixed set of five questions about every image.
# The answers to question k are collected, in input order, in the list qks.
q1s, q2s, q3s, q4s, q5s = [], [], [], [], []
from models.blip_vqa import blip_vqa

image_size = 512
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
model = blip_vqa(pretrained=model_url, image_size=image_size, vit='base')
model.eval()
model = model.to(device)

# Questions, listed in the same order as the answer lists they feed.
questions = (
    'what is the style of the image?',
    'what are the colors predominant in the image?',
    'what is the mood of the image?',
    'what is the subject doing in this image?',
    'where does this image take place?',
)
answer_lists = (q1s, q2s, q3s, q4s, q5s)

for image in inputs:
    with torch.no_grad():
        replies = [
            model(image, prompt, train=False, inference='generate')
            for prompt in questions
        ]

    # Only the answer to the first question is echoed.
    print('answer: '+replies[0][0])
    for bucket, reply in zip(answer_lists, replies):
        bucket.append(reply[0])

In [None]:
# Build the new file name for every image: the caption followed by the five
# VQA answers, comma-separated, with a '.png' suffix.
# NOTE(review): '.png' is appended even when the source file is .jpg/.jpeg --
# confirm that this is intended.
new_captions = []
for caption, a1, a2, a3, a4, a5 in zip(captions, q1s, q2s, q3s, q4s, q5s):
    # An entry of `captions` may be the raw sequence returned by
    # model.generate rather than a plain string; concatenating that with ','
    # raises TypeError, so take its first element in that case.
    if not isinstance(caption, str):
        caption = caption[0]
    new_captions.append(','.join((caption, a1, a2, a3, a4, a5)) + '.png')


In [None]:
# Rename every image file on disk to its generated caption-based name.
# file_paths[i] is renamed to new_captions[i] inside `path`.
path = '/content/drive/MyDrive/MassOutput/Test/'

# Check the pairing up front so a mismatch cannot leave the folder half-renamed.
if len(new_captions) != len(file_paths):
    raise ValueError(
        'expected one new name per file, got '
        + str(len(new_captions)) + ' names for '
        + str(len(file_paths)) + ' files'
    )

for old_name, caption_name in zip(file_paths, new_captions):
    # Generated text may contain the path separator, which would point the
    # target into a (non-existent) sub-directory.
    safe_name = caption_name.replace(os.sep, '_')
    new_name = os.path.join(path, safe_name)

    # Different images can receive identical text. os.rename may silently
    # replace an existing file, so pick a free name by adding a counter.
    stem, ext = os.path.splitext(new_name)
    counter = 1
    while new_name != old_name and os.path.exists(new_name):
        new_name = stem + '_' + str(counter) + ext
        counter += 1

    os.rename(old_name, new_name)