In [1]:
# Mount Google Drive at /content/drive (Colab only) so files under it can be read like local files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Locations of the dataset on the mounted Drive
import os

home = '/content/drive/MyDrive/2ReadyForFineTuning/VLQAv1/Blocksworld/'

# `home` already ends with '/', so direct interpolation yields the image folder path
imroot = f"{home}merged_images"

# Make the dataset folder the working directory (relative paths used later resolve against it)
os.chdir(home)

In [3]:
# Convert the raw JSONL file into a single VQA-style dict ({"questions": [...]}) and dump it under /content
prefix = 'blocksworld'

import json
import ast
datalen = 0
dumpdict = { "info": {}, "task_type": "Open-Ended", "data_type": "blocksworld", "license": {}, "data_subtype": prefix, "questions": [] }


def _image_name(path, drop_ext=False):
  """Return the file name of `path` without its directory part.

  str.lstrip/str.rstrip remove *sets of characters*, not a prefix/suffix, so
  e.g. 'bag.jpg'.rstrip('.jpg') yields 'ba'. This helper removes only the
  directory part and, when `drop_ext` is True, one exact trailing '.jpg'.
  """
  name = path.rsplit('/', 1)[-1]
  if drop_ext and name.endswith('.jpg'):
    name = name[:-len('.jpg')]
  return name


with open(home+prefix+'.jsonl') as f:
  # Skip blank lines so json.loads never sees an empty record and
  # datalen equals the number of converted questions
  data = [line for line in f if line.strip()]

print(len(data))
# Plain module-level assignment: no `global` statement is needed here
datalen = len(data)

for line in data:
  ijson = json.loads(line)
  # 'images' and 'answer_choices' are parsed with ast.literal_eval, i.e. they hold string-encoded Python literals
  imlist = ast.literal_eval(ijson['images'])
  anslist = ast.literal_eval(ijson['answer_choices'])
  if len(anslist)==2:
    # Pad two-choice questions up to four choices
    anslist.append("Unknown")
    anslist.append("Unanswerable")
  if len(anslist) != 4:
    # Report records with an unexpected number of choices; the record is still kept
    print(line)
  # Merged image file name pattern: Merged_<first image stem>#<second image file name>
  im1 = _image_name(imlist[0], drop_ext=True)
  im2 = _image_name(imlist[1])
  dumpdict['questions'].append( { "image_id": imroot+"/Merged_"+im1+"#"+im2, "question": ijson['question']+" "+ijson['passage'], "question_id": ijson['qid'], "answer_choices": anslist, "answer_id": int(ijson['answer']) } )

# NOTE: the output is one indented JSON document even though the file is named *.jsonl
with open('/content/'+prefix+'.jsonl','w') as w:
  w.write(json.dumps(dumpdict,indent=4))

print(dumpdict)

250
{'info': {}, 'task_type': 'Open-Ended', 'data_type': 'blocksworld', 'license': {}, 'data_subtype': 'blocksworld', 'questions': [{'image_id': '/content/drive/MyDrive/2ReadyForFineTuning/VLQAv1/Blocksworld/merged_images/Merged_BW_1_0#BW_1_1.jpg', 'question': 'How many moves are required at minimum if configuration in image [0] is to be transformed into configuration in image [1]? Consider 6 blocks of colors [Red, Green, Purple, Orange, Yellow and Blue]. Blocks can be moved as per three conditions below. A block can be moved if there is no other block on it. At each time stamp only one block can be moved. A block can be moved OnTable, OutOfTable or on any other block.', 'question_id': 0, 'answer_choices': [1, 5, 2, 3], 'answer_id': 0}, {'image_id': '/content/drive/MyDrive/2ReadyForFineTuning/VLQAv1/Blocksworld/merged_images/Merged_BW_2_0#BW_2_1.jpg', 'question': 'How many moves are required at minimum if configuration in image [0] is to be transformed into configuration in image [1]? 

In [4]:
import json
# Reload the converted dataset from disk; the context manager closes the file handle
with open('/content/'+prefix+'.jsonl','r') as f:
  data_questions = json.load(f)
questions = data_questions['questions']

print(data_questions.keys())
print("Number of questions:", len(questions))

dict_keys(['info', 'task_type', 'data_type', 'license', 'data_subtype', 'questions'])
Number of questions: 250


In [5]:
# Install necessary libraries
!pip install torch
!pip install salesforce-lavis
!pip install transformers



In [6]:
# Import necessary libraries
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from transformers import AutoProcessor, AutoModelForCausalLM
from huggingface_hub import hf_hub_download

  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


In [7]:
# Pick the compute device: CUDA when present, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [8]:
# Load the BLIP VQA model (VQAv2-finetuned checkpoint) in eval mode,
# together with its image and text preprocessors, onto `device`
(
  model_blip_vqa2,
  vis_processors_blip_vqa2,
  txt_processors_blip_vqa2,
) = load_model_and_preprocess(
  name="blip_vqa",
  model_type="vqav2",
  is_eval=True,
  device=device,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1.35G/1.35G [00:49<00:00, 29.3MB/s]
  checkpoint = torch.load(cached_file, map_location="cpu")


# Batch Run

In [9]:
# Create a batch run script that executes BLIP on an item from the dataset
def batchrun(impath, question, answers):
  """Answer one question about one image with the BLIP VQA model.

  Args:
    impath: Path of the image file to open.
    question: Question text; it is passed through the BLIP "eval" text processor.
    answers: List of candidate answer strings handed to the model as `answer_list`.

  Returns:
    The first element of the list returned by `predict_answers` with
    inference_method="rank", i.e. the prediction for this single sample.
  """
  # Close the file handle deterministically; convert() returns a separate in-memory image
  with Image.open(impath) as img:
    raw_image = img.convert("RGB")

  # unsqueeze(0) adds a leading dimension so the single image forms a batch of one
  image_blip_vqa2 = vis_processors_blip_vqa2["eval"](raw_image).unsqueeze(0).to(device)
  # NOTE(review): the LAVIS question processor may truncate long inputs (max word count) —
  # confirm the passage appended to the question is not cut off
  question_blip_vqa2 = txt_processors_blip_vqa2["eval"](question)
  samples_blip_vqa2 = {"image": image_blip_vqa2, "text_input": question_blip_vqa2}
  # Inference only: no gradients are needed, so skip autograd bookkeeping
  with torch.no_grad():
    bvqa2 = model_blip_vqa2.predict_answers(samples_blip_vqa2, answer_list=answers, inference_method="rank")[0]

  return bvqa2


In [10]:
# Run BLIP on every question and write one result row per example to a CSV file
import csv
filprefix = "IPQ_BLIP_"+prefix.upper()

# newline='' is required by the csv module so that it controls line endings itself
with open(filprefix+'.csv', 'w', newline='') as k:
  spamwriter = csv.writer(k)
  spamwriter.writerow(["qid","gt_ans","pred_ans_blip","correctness_blip"])
  # Iterate over the loaded questions directly rather than over a counter set in another cell,
  # so the loop can never index past the end of `questions`
  for exid, item in enumerate(questions):
    print(exid)
    choices = item['answer_choices']
    print(item['image_id'], item['question'], choices)
    # The model ranks string candidates, so every choice is stringified first
    pred_ans_blip = batchrun(item['image_id'], item['question'], [str(c) for c in choices])
    gtclass = choices[int(item['answer_id'])]
    # Compare as strings because choices may be ints while the prediction is text
    correctness_blip = 1 if str(gtclass)==str(pred_ans_blip) else 0
    print(exid,gtclass,pred_ans_blip,correctness_blip)
    # The "qid" column holds the positional index, not the record's 'question_id' field
    spamwriter.writerow([exid,gtclass,pred_ans_blip,correctness_blip])

0
/content/drive/MyDrive/2ReadyForFineTuning/VLQAv1/Blocksworld/merged_images/Merged_BW_1_0#BW_1_1.jpg How many moves are required at minimum if configuration in image [0] is to be transformed into configuration in image [1]? Consider 6 blocks of colors [Red, Green, Purple, Orange, Yellow and Blue]. Blocks can be moved as per three conditions below. A block can be moved if there is no other block on it. At each time stamp only one block can be moved. A block can be moved OnTable, OutOfTable or on any other block. [1, 5, 2, 3]
0 1 5 0
1
/content/drive/MyDrive/2ReadyForFineTuning/VLQAv1/Blocksworld/merged_images/Merged_BW_2_0#BW_2_1.jpg How many moves are required at minimum if configuration in image [0] is to be transformed into configuration in image [1]? Consider 6 blocks of colors [Red, Green, Purple, Orange, Yellow and Blue]. Blocks can be moved as per three conditions below. A block can be moved if there is no other block on it. At each time stamp only one block can be moved. A blo