<a href="https://colab.research.google.com/github/sarenales/VSR-Retrieval/blob/main/Extraccion_resultados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OFA

In [None]:
!git clone --single-branch --branch feature/add_transformers https://github.com/OFA-Sys/OFA

In [None]:
!pip install OFA/transformers/

In [None]:
!git lfs install
!git clone https://huggingface.co/OFA-Sys/OFA-large

In [None]:
from transformers.models.ofa.generate import sequence_generator

In [None]:
from transformers import OFATokenizer, OFAModel
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the checkpoint from the local ./OFA-large directory with bfloat16 weights and
# move it to `device`. Image tensors are cast to torch.bfloat16 before being fed to it.
ofa_model = OFAModel.from_pretrained("./OFA-large", torch_dtype=torch.bfloat16, use_cache=False).to(device)
# Tokenizer loaded from the same checkpoint directory as the model.
ofa_tokenizer = OFATokenizer.from_pretrained("./OFA-large")

In [None]:
from PIL import Image
from torchvision import transforms
# Per-channel statistics handed to transforms.Normalize below.
mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
# Side length, in pixels, of the square image produced by load_image().
# Original author's note: "LARGE --- 384  BASE" (presumably 480 for the LARGE
# checkpoint and 384 for BASE -- confirm before switching checkpoints).
resolution = 480

def load_image(image):
    """Resize, tensorise and normalise an image, returning a batch of one on ``device``.

    Args:
        image: Image accepted by the torchvision transforms used below
            (callers in this notebook pass an RGB PIL image).

    Returns:
        The transformed image with a leading batch dimension added
        (``unsqueeze(0)``), moved to ``device``.
    """
    patch_resize_transform = transforms.Compose([
        # torchvision documents the InterpolationMode enum for this argument;
        # the PIL integer constant used previously selects the same bicubic
        # filter but goes through torchvision's legacy int-to-enum path.
        transforms.Resize(
            (resolution, resolution),
            interpolation=transforms.InterpolationMode.BICUBIC,
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])
    patch_img = patch_resize_transform(image).unsqueeze(0).to(device)
    return patch_img

# DATASET

In [None]:
!git clone https://github.com/cambridgeltl/visual-spatial-reasoning.git

In [None]:
import json
# JSON Lines file inside the visual-spatial-reasoning repository cloned above
# (absolute Colab path).
archivo_jsonl = "/content/visual-spatial-reasoning/data/data_files/all_vsr_validated_data.jsonl"
# NOTE: despite its name, `train` receives every record of the file; the
# train/dev distinction is derived later from each record's image link.
# The file is read as UTF-8 explicitly (so the result does not depend on the
# platform's default encoding) and blank lines are skipped, since json.loads
# would raise on them.
with open(archivo_jsonl, "r", encoding="utf-8") as f:
  train = [json.loads(linea) for linea in f if linea.strip()]

In [None]:
# Maps each spatial relation to the relation used as its "opposite" when building
# the negative caption. Every value is written in lower case and is itself a key
# of this dictionary: the values are substituted into the middle of a sentence,
# so a capitalised "Adjacent to" (or the typo "out to") would end up verbatim in
# the generated caption.
diccionario_opuesto = {
    # Adjacency (10)
    "adjacent to": "alongside",
    "alongside": "adjacent to",
    "at the side of": "against",
    "at the right side of": "at the left side of",
    "at the left side of": "at the right side of",
    "attached to": "against",
    "at the back of": "ahead of",
    "ahead of": "at the back of",
    "against": "ahead of",
    "at the edge of": "adjacent to",

    # Directional (16)
    "off": "up",
    "past": "across from",
    "toward": "down",
    "down": "up",
    "deep down": "up",
    "up": "deep down",
    "away from": "across from",
    "along": "adjacent to",
    "around": "off",
    "from": "away from",
    "into": "off",
    "to": "in front of",
    "across": "off",
    "across from": "adjacent to",
    "through": "adjacent to",
    "down from": "into",

    # Orientation (4)
    "facing": "facing away from",
    "facing away from": "facing",
    "parallel to": "perpendicular to",
    "perpendicular to": "parallel to",

    # Projective (12)
    "on top of": "beneath",
    "beneath": "on top of",
    "beside": "far from",
    "behind": "in front of",
    "left of": "right of",
    "right of": "left of",
    "under": "over",
    "in front of": "behind",
    "below": "above",
    "above": "below",
    "over": "under",
    "in the middle of": "above",

    # Proximity (5 entries here; the original header said 6)
    "by": "far from",
    "close to": "far from",
    "near": "far from",
    "far from": "close to",
    "far away from": "close to",

    # Topological (18)
    "connected to": "detached from",
    "detached from": "part of",
    "has as a part": "detached from",
    "part of": "detached from",
    "contains": "detached from",
    "within": "out of",
    "at": "out of",
    "on": "out of",
    "in": "out of",
    "with": "out of",
    "surrounding": "out of",
    "among": "out of",
    "consists of": "out of",
    "out of": "between",
    "between": "out of",
    "inside": "outside",
    "outside": "inside",
    "touching": "detached from",

    # Unallocated (5 entries here; the original header said 6 -- "among" is
    # already defined under Topological, so its duplicate stays disabled)
    "beyond": "enclosed by",
    "next to": "beyond",
    "opposite to": "adjacent to",
    "after": "among",
    # "among": "after",
    "enclosed by": "beyond",
}

In [None]:
def obtener_contrario(palabra, diccionario):
  """Return the value stored for ``palabra`` in ``diccionario``.

  When ``palabra`` is not a key, the sentinel string
  "Palabra no encontrada" is returned instead of raising.
  """
  return diccionario.get(palabra, "Palabra no encontrada")

In [None]:
def saber_split(url):
  """Classify an image link as "train" or "dev".

  Returns "train" when the substring 'train' occurs anywhere in ``url``,
  and "dev" for every other link.
  """
  return "train" if 'train' in url else "dev"

In [None]:
import re

# Records that pass the filter, keyed by a running integer index.
filtro = {}
cont = 0

for elemento in train:
  # Keep only the records whose label is 1; every other record is ignored.
  if elemento.get("label") != 1:
    continue

  caption = elemento.get("caption")
  relacion = elemento.get("relation")
  # Look the opposite up once and reuse it for both "caption-" and "relation-".
  relacion_opuesta = obtener_contrario(relacion, diccionario_opuesto)

  # Swap the relation only where it stands on its own as whole words. A plain
  # str.replace also rewrote occurrences inside other words (e.g. relation
  # "on" inside "person", or "at" inside "cat"), corrupting the caption.
  # The replacement is passed as a function so it is inserted literally.
  patron = r"(?<!\w)" + re.escape(relacion) + r"(?!\w)"
  caption_negativa = re.sub(patron, lambda _m: relacion_opuesta, caption)

  filtro[cont] = {
      "image": elemento.get("image"),
      "image_link": elemento.get("image_link"),
      "caption+": caption,
      "caption-": caption_negativa,
      "relation+": relacion,
      "relation-": relacion_opuesta,
      "split": saber_split(elemento.get("image_link")),
  }
  cont += 1

# Persist the filtered records; the evaluation sections read this file back.
with open("filtrado.json", "w") as archivo:
  json.dump(filtro, archivo)


print(f"Nuevo JSON creado exitosamente con los elementos filtrados. Número de elementos {cont}.")

# Extracción de características (Evaluación 1)

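Aquí almacenamos únicamente los datos necesarios obtenidos del modelo: para cada imagen, la respuesta generada y las probabilidades de «yes» y «no», tanto para el caption original como para el caption con la relación opuesta.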



Los datos se guardan en **"resultados_large.json"**.

Tiempo de análisis aproximado: **1h 30min**

In [None]:
import json
import torch
import numpy as np
import time
import requests
from io import BytesIO
from torch import nn

from tqdm import tqdm

In [None]:
def token_imag(url, timeout=30):
  """Download the image at ``url`` and preprocess it with ``load_image``.

  Args:
    url: Address of the image to fetch.
    timeout: Seconds passed to ``requests.get`` as its timeout, so a stalled
      server cannot block the whole run indefinitely.

  Returns:
    A tuple ``(patch_img, image)``: the result of ``load_image`` applied to
    the RGB-converted image, and the image exactly as opened (not converted).

  Raises:
    requests.HTTPError: If the server answers with an error status.
  """
  response = requests.get(url, timeout=timeout)
  # Surface HTTP errors here rather than letting PIL fail on an error page body.
  response.raise_for_status()
  image = Image.open(BytesIO(response.content))
  patch_img = load_image(image.convert("RGB"))
  return patch_img, image

In [None]:
def token_cap(caption):
  """Tokenize ``caption`` prefixed with the yes/no question prompt.

  The prompt and the caption are joined into one string, tokenized as a
  batch of one, moved to ``device``, and the ``input_ids`` are returned.
  """
  pregunta = "Does the image describe the following sentence?  " + caption
  codificado = ofa_tokenizer([pregunta], return_tensors="pt")
  return codificado.to(device).input_ids

In [None]:
def input_model(caption, patch_img):
  """Generate an answer for tokenized text ``caption`` and image ``patch_img``.

  Returns a tuple ``(text, raw_output)``: the first decoded sequence with
  special tokens skipped and surrounding whitespace stripped, and the
  untouched object returned by ``ofa_model.generate``.
  """
  salida = ofa_model.generate(
      caption,
      patch_images=patch_img,
      num_beams=1,
      no_repeat_ngram_size=1,
      return_dict_in_generate=True,
      output_scores=True,
  )
  # Element 0 of the generate output is what gets decoded into text.
  textos = ofa_tokenizer.batch_decode(salida[0], skip_special_tokens=True)
  return textos[0].strip(), salida

In [None]:
# Load the filtered records from filtrado.json into `datos`.
with open("filtrado.json", "r") as f:
    datos = json.load(f)

In [None]:
def get_probabilidades(gen_output_0):
  """Return ``(probYES, probNO)`` read from a generation output.

  A softmax over the last dimension is applied to ``gen_output_0[1][0]``
  (presumably the scores of the first generated step -- confirm against the
  generate() output layout). Row 0 of the result is then read at the
  vocabulary ids of "yes" and "no", taken from the module-level ``tokens``
  mapping, which must be defined before this function is called.

  Original author's note: make sure the answer really is YES/NO.
  """
  primer_paso = gen_output_0[1][0]
  distribucion = nn.functional.softmax(primer_paso, dim=-1)[0]
  id_yes, id_no = tokens.get("yes"), tokens.get("no")
  return distribucion[id_yes].item(), distribucion[id_no].item()

In [None]:
# Evaluated records, keyed by a running integer index.
resultados = {}
# get_probabilidades() looks up the "yes"/"no" ids in this module-level vocabulary.
tokens = ofa_tokenizer.get_vocab()

cont = 0

# Iterate over the records directly and pass `total`: wrapping enumerate() hid
# the length from tqdm, so it could not show a percentage or an ETA.
for valor in tqdm(datos.values(), total=len(datos)):
  # The image is downloaded once per record and reused for both captions;
  # it is cast to bfloat16 to match the dtype the model was loaded with.
  patch_img = token_imag(valor["image_link"])[0].to(torch.bfloat16)

  # Original caption.
  respP, gen_output_P = input_model(token_cap(valor["caption+"]), patch_img)
  probYESP, probNOP = get_probabilidades(gen_output_P)

  # Caption with the relation replaced by its opposite.
  respN, gen_output_N = input_model(token_cap(valor["caption-"]), patch_img)
  probYESN, probNON = get_probabilidades(gen_output_N)

  # Store a new dict instead of mutating the entry of `datos`, so re-running
  # this cell does not depend on (or alter) the loaded records.
  resultados[cont] = {
      **valor,
      "PY+": probYESP,
      "PN+": probNOP,
      "response+": respP,
      "PY-": probYESN,
      "PN-": probNON,
      "response-": respN,
  }
  cont += 1

with open("resultados_large.json", "w") as archivo:
  json.dump(resultados, archivo)

In [None]:
# Write `resultados` to resultados_large.json again (the same dump that ends the
# evaluation loop).
with open("resultados_large.json", "w") as archivo:
  json.dump(resultados, archivo)

# Extracción de características (Evaluación 2)

Los datos se guardan en **"resultados_captioner_Large_17.json"**.

Tiempo de análisis aproximado: **2h 30min**

In [None]:
import json
import torch
import numpy as np
import time
import requests
from io import BytesIO
from torch import nn
import math
from tabulate import tabulate

In [None]:
def token_imag(url, timeout=30):
  """Download the image at ``url`` and preprocess it with ``load_image``.

  Args:
    url: Address of the image to fetch.
    timeout: Seconds passed to ``requests.get`` as its timeout, so a stalled
      server cannot block the whole run indefinitely.

  Returns:
    A tuple ``(patch_img, image)``: the result of ``load_image`` applied to
    the RGB-converted image, and the image exactly as opened (not converted).

  Raises:
    requests.HTTPError: If the server answers with an error status.
  """
  response = requests.get(url, timeout=timeout)
  # Surface HTTP errors here rather than letting PIL fail on an error page body.
  response.raise_for_status()
  image = Image.open(BytesIO(response.content))
  patch_img = load_image(image.convert("RGB"))
  return patch_img, image

In [None]:
def token_cap(caption):
  """Tokenize ``caption`` as-is (no prompt is added) and return its ``input_ids``.

  The text is tokenized as a batch of one with padding and truncation
  enabled, and the encoding is moved to ``device`` before the ids are read.
  NOTE: this redefines the earlier ``token_cap`` of the notebook, which
  prepends a question prompt; whichever cell ran last is the one in effect.
  """
  codificado = ofa_tokenizer(
      [caption],
      padding=True,
      truncation=True,
      return_tensors="pt",
  )
  return codificado.to(device).input_ids

In [None]:
import json
# Reload the filtered records from filtrado.json into `datos` for this evaluation.
with open("filtrado.json", "r") as f:
    datos = json.load(f)

In [None]:
def probability_to_logit(p):
    """Return the natural logarithm of ``p``.

    A probability of exactly zero is mapped to the constant -10000 instead
    of being passed to ``np.log``.
    """
    return -10000 if p == 0.0 else np.log(p)

In [None]:
import math
def pro_logs(logits_output, caption=None, caption_ids=None):
  """Accumulate per-token scores of a caption from a model forward pass.

  For every position ``i`` except the last, the scores at position ``i`` are
  read at the id found at position ``i + 1`` of ``caption_ids[0]``.

  Args:
    logits_output: Object whose ``.logits`` can be indexed as
      ``[0][position][token_id]``; a softmax over its last dimension gives
      the probabilities.
    caption: Unused; kept so existing callers keep working.
    caption_ids: Token ids of the caption; only row 0 is read.

  Returns:
    A tuple ``(sum_pro, mul_pro, suma_log1, suma_log2)``: the sum of the token
    probabilities, their product, the sum of the raw logits, and the sum of
    ``probability_to_logit(prob)`` over the scored tokens.
  """
  logits = logits_output.logits
  probabilities = nn.functional.softmax(logits, dim=-1)
  target_ids = caption_ids[0]

  sum_pro = 0
  # A product must start at 1: starting at 0 made mul_pro always 0.
  mul_pro = 1
  suma_log1 = 0
  suma_log2 = 0
  for i in range(len(target_ids) - 1):
    next_id = target_ids[i + 1]
    prob = probabilities[0][i][next_id].item()
    sum_pro += prob
    mul_pro *= prob
    suma_log1 += logits[0][i][next_id].item()
    suma_log2 += probability_to_logit(prob)
  return sum_pro, mul_pro, suma_log1, suma_log2

In [None]:
# Scored records, keyed by a running integer index.
resultados_captioner = {}

# Fixed text prompt given to the encoder for every image.
txt = "what does the image describe?"
inputs = token_cap(txt)
tokens = ofa_tokenizer.get_vocab()
cont = 0


def _puntuar_caption(caption, patch_img):
  """Score ``caption`` as decoder input for ``patch_img``; return the four metrics by name."""
  decoder_input = token_cap(caption)
  # Inference only: without no_grad() every forward pass records an autograd
  # graph that is never used, wasting memory over thousands of images.
  with torch.no_grad():
    salida = ofa_model(input_ids=inputs, patch_images=patch_img, decoder_input_ids=decoder_input)
  sum_pro, mul_pro, suma_log1, suma_log2 = pro_logs(salida, caption_ids=decoder_input)
  # `partes` is not defined in the visible notebook (it raised NameError here).
  # The mean is taken over the number of tokens pro_logs actually scored,
  # i.e. one fewer than the length of the id row; guard against a zero divisor.
  n_tokens = max(len(decoder_input[0]) - 1, 1)
  return {
      "mul_prob": mul_pro,
      "suma_log1": suma_log1,
      "suma_log2": suma_log2,
      "prob_media": sum_pro / n_tokens,
  }


# Pass `total` so tqdm can display a percentage and an ETA.
for valor in tqdm(datos.values(), total=len(datos)):
  # One download per record, cast to bfloat16 to match the model's dtype.
  patch_img = token_imag(valor["image_link"])[0].to(torch.bfloat16)

  # Copy the record instead of mutating the entry of `datos` in place.
  fila = dict(valor)
  # "+" is the original caption, "-" the one with the opposite relation; the
  # same scoring is applied to both and stored under suffixed keys.
  for signo in ("+", "-"):
    for nombre, dato in _puntuar_caption(valor["caption" + signo], patch_img).items():
      fila[nombre + signo] = dato

  resultados_captioner[cont] = fila
  cont += 1

with open("resultados_captioner_Large_17.json", "w") as archivo:
  json.dump(resultados_captioner, archivo)

In [None]:
# Write `resultados_captioner` to resultados_captioner_Large_17.json again (the
# same dump that ends the scoring loop).
with open("resultados_captioner_Large_17.json", "w") as archivo:
  json.dump(resultados_captioner, archivo)