# Calculate the likelihood of winoground captions using RoBERTa base

## Download and preprocess the winoground dataset

In [2]:
!pip install transformers
!pip install datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from datasets import load_dataset
from torchvision import transforms
from torch.utils.data import DataLoader
from PIL import Image
import torch
import numpy
import matplotlib.pyplot as plt
from tqdm import tqdm

auth_token = "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"  # Replace with an auth token, which you can get from your huggingface account: Profile -> Settings -> Access Tokens -> New Token
winoground = load_dataset("facebook/winoground", use_auth_token=auth_token)["test"]

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize
    transforms.ToTensor()           # Convert to PyTorch tensor
])

def transform_wino(examples):
    examples["image_0"] = [transform(image.convert("RGB")) for image in examples["image_0"]]
    examples["image_1"] = [transform(image.convert("RGB")) for image in examples["image_1"]]
    return examples

winoground.set_transform(transform_wino)


Found cached dataset winoground (/Users/simrankhanuja/.cache/huggingface/datasets/facebook___winoground/default/0.0.0/72585f4d9cd5a28790bb9bc2adbdd45633f36dfbf85df529e0756e114e134285)


  0%|          | 0/1 [00:00<?, ?it/s]

## Calculate log likelihood of captions

In [7]:
from sentence_transformers import SentenceTransformer, util
import json
model = SentenceTransformer('all-mpnet-base-v2')

# Two lists of sentences
captions0 = []
captions1 = []

for sample in winoground:
    captions0.append(sample["caption_0"])
    captions1.append(sample["caption_1"])

# Compute embedding for both lists
embeddings1 = model.encode(captions0, convert_to_tensor=True)
embeddings2 = model.encode(captions1, convert_to_tensor=True)

# Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)

sentence_similarity = {}
count = 0
# Output the pairs with their score
for i in range(len(captions0)):
    sentence_similarity[count] = float(cosine_scores[i][i].numpy())
    count += 1
    sentence_similarity[count] = float(cosine_scores[i][i].numpy())
    count += 1

In [8]:
with open("sentence_trans_sim.json", 'w') as f:
  json.dump(sentence_similarity, f)

In [4]:
from transformers import ViTImageProcessor, ViTModel
import torch.nn.functional as F
 
processor = ViTImageProcessor.from_pretrained('facebook/dino-vitb8')
model = ViTModel.from_pretrained('facebook/dino-vitb8').eval()

batch_size = 16

cosine_similarities = []
for i in range(0, len(winoground), batch_size):
    end_idx = i+batch_size if i+batch_size < len(winoground) else len(winoground)
    cap0_batch = winoground[i:end_idx]["image_0"]
    cap1_batch = winoground[i:end_idx]["image_1"]
    cap0_inputs = processor(cap0_batch, return_tensors="pt", padding=True)
    cap1_inputs = processor(cap1_batch, return_tensors="pt", padding=True)

    cap0_features = model(**cap0_inputs).last_hidden_state.mean(dim=1)
    cap1_features = model(**cap1_inputs).last_hidden_state.mean(dim=1)

    cap0_features = torch.nn.functional.normalize(cap0_features, p=2, dim=1)
    cap1_features = torch.nn.functional.normalize(cap1_features, p=2, dim=1)

    cosine_similarities.extend(F.cosine_similarity(cap0_features, cap1_features).cpu().tolist())
    print(cosine_similarities[-1])

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb8 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


0.9989266395568848


In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
import json

# Load the model and tokenizer
model_name = "roberta-base"
model = RobertaForMaskedLM.from_pretrained(model_name)
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model.eval()

def calculate_log_likelihood(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        log_likelihood = outputs.logits.gather(2, inputs["input_ids"].unsqueeze(-1)).sum().item()
    return log_likelihood
  

likelihood = {}  # To store max softmax probabilities for all images
count = 0
for sample in winoground:
    cap_0 = sample["caption_0"]
    cap_1 = sample["caption_1"]

    ll_cap_0 = calculate_log_likelihood(cap_0) / len(tokenizer.tokenize(cap_0))
    ll_cap_1 = calculate_log_likelihood(cap_1) / len(tokenizer.tokenize(cap_1))

    likelihood[count] = ll_cap_0
    count+=1
    likelihood[count] = ll_cap_1
    count+=1

text_likelihood = {}
for i in likelihood:
  text_likelihood[i] = str(f'{likelihood[i]:.3f}')

with open("text_likelihood.json", 'w') as f:
  json.dump(text_likelihood, f)