## Download and preprocess the Winoground dataset

In [None]:
!pip install transformers
!pip install datasets

In [None]:
from datasets import load_dataset
from torchvision import transforms
from torch.utils.data import DataLoader
from PIL import Image
import torch
import numpy
import matplotlib.pyplot as plt
from tqdm import tqdm

import os

# Never store a Hugging Face access token in the notebook itself. The token is
# read from the HF_TOKEN environment variable (create one under
# Profile -> Settings -> Access Tokens -> New Token). When the variable is not
# set, `True` tells `datasets` to use the token saved by `huggingface-cli login`.
# NOTE: any token that was previously committed in this file should be revoked.
auth_token = os.environ.get("HF_TOKEN") or True
winoground = load_dataset("facebook/winoground", use_auth_token=auth_token)["test"]

# Image preprocessing pipeline: resize to 224x224, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ]
)

def transform_wino(examples):
    """Preprocess both image columns of a batch of examples.

    Every image under ``image_0`` and ``image_1`` is converted to RGB and
    passed through the module-level ``transform``. The two columns are
    overwritten in ``examples`` and the same mapping is returned.
    """
    for column in ("image_0", "image_1"):
        rgb_images = (picture.convert("RGB") for picture in examples[column])
        examples[column] = list(map(transform, rgb_images))
    return examples

# Register transform_wino as the dataset's transform so that the image columns
# are preprocessed when examples are read from `winoground`.
winoground.set_transform(transform_wino)

## Calculate cosine similarity of captions using sentence transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import json
# Sentence encoder used to embed the captions.
model = SentenceTransformer("all-mpnet-base-v2")

# Gather both captions of every sample in a single pass over the dataset.
caption_pairs = [(entry["caption_0"], entry["caption_1"]) for entry in winoground]
captions0 = [first for first, _ in caption_pairs]
captions1 = [second for _, second in caption_pairs]

# Embed each caption list as a tensor.
embeddings1 = model.encode(captions0, convert_to_tensor=True)
embeddings2 = model.encode(captions1, convert_to_tensor=True)

# All-pairs cosine similarity: entry [i][j] compares captions0[i] with captions1[j].
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Map every caption index to the similarity between the two captions of its
# sample: captions 2*i and 2*i + 1 belong to sample i and share one score.
sentence_similarity = {}
for pair_idx in range(len(captions0)):
    # `.item()` works for tensors on any device, whereas `.numpy()` raises for
    # CUDA tensors (which `encode(..., convert_to_tensor=True)` returns when
    # the model runs on a GPU).
    pair_score = cosine_scores[pair_idx][pair_idx].item()
    sentence_similarity[2 * pair_idx] = pair_score
    sentence_similarity[2 * pair_idx + 1] = pair_score
# Kept for parity with the original cell, which left the running index behind.
count = len(sentence_similarity)

In [None]:
# Persist the per-caption similarity scores as JSON.
with open("sentence-transformers-similarity.json", 'w') as out_file:
    out_file.write(json.dumps(sentence_similarity))

## Calculate perplexity of captions using GPT-2

In [None]:
import torch
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def get_sentence_log_probability(sentence, model, tokenizer):
    """Return the summed natural-log probability of ``sentence`` under ``model``.

    The sentence is tokenized with ``tokenizer.encode`` and scored in a single
    forward pass. Each token after the first is scored given the tokens before
    it; the first token has no preceding context here and contributes nothing.

    Args:
        sentence: Text to score.
        model: Callable taking the token-id tensor and returning an object
            with a ``logits`` attribute of shape (batch, tokens, vocab).
        tokenizer: Object whose ``encode(sentence, return_tensors="pt")``
            returns a (1, tokens) tensor of token ids.

    Returns:
        The sum of natural-log probabilities as a Python float (``0.0`` for a
        single-token sentence, since nothing is predicted).
    """
    input_ids = tokenizer.encode(sentence, return_tensors="pt")

    with torch.no_grad():
        # Drop the last position: its logits predict a token beyond the sentence.
        logits = model(input_ids).logits[:, :-1, :]

    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    # Position t of the logits predicts token t + 1 of the input.
    targets = input_ids[:, 1:].unsqueeze(-1)
    token_log_probs = torch.gather(log_probs, 2, targets).squeeze(-1)

    return token_log_probs.sum().item()


def get_sentence_perplexity(sentence, model, tokenizer):
    """Return the perplexity of ``sentence`` under ``model``.

    Perplexity is ``exp`` of the mean negative log-probability per predicted
    token. The log-probabilities are natural logs, so the base must be ``e``
    (not 2), and the mean is taken over the ``N - 1`` tokens that are actually
    predicted rather than over all ``N`` tokens.

    Args:
        sentence: Text to score.
        model: Language model, see ``get_sentence_log_probability``.
        tokenizer: Tokenizer, see ``get_sentence_log_probability``.

    Returns:
        The perplexity as a float. A single-token sentence has no predicted
        token and yields ``1.0``.
    """
    import math

    log_prob = get_sentence_log_probability(sentence, model, tokenizer)
    num_tokens = len(tokenizer.encode(sentence))
    # Guard against a zero divisor for single-token sentences.
    num_predicted = max(num_tokens - 1, 1)
    return math.exp(-log_prob / num_predicted)

# Load the GPT-2 language model (switched to inference mode) and its tokenizer.
model_name = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Caption index -> GPT-2 perplexity. Sample i owns indices 2*i (caption_0)
# and 2*i + 1 (caption_1).
perplexity = {}
count = 0
for sample in winoground:
    for caption_key in ("caption_0", "caption_1"):
        perplexity[count] = get_sentence_perplexity(sample[caption_key], model, tokenizer)
        count += 1

    # Progress indicator: number of captions scored so far.
    print(count)

# Persist the per-caption perplexities as JSON.
with open("gpt2_perplexity.json", 'w') as out_file:
    out_file.write(json.dumps(perplexity))

## Calculate log likelihood of captions using RoBERTa Base

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
import json

# Load RoBERTa (masked-LM head, inference mode) together with its tokenizer.
model_name = "roberta-base"
model = RobertaForMaskedLM.from_pretrained(model_name).eval()
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def calculate_log_likelihood(sentence):
    """Return the summed log-probability RoBERTa assigns to the sentence's own tokens.

    The sentence is tokenized with the module-level ``tokenizer`` and run
    through the module-level ``model`` once, without masking. For each input
    position, the log-probability of the token actually present there is
    taken, and all positions are summed.

    Args:
        sentence: Text to score.

    Returns:
        The summed natural-log probability as a Python float.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # Normalise over the vocabulary first: gathering raw logits (as was done
    # before) sums unnormalised scores, which are not log-probabilities.
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    # NOTE(review): the sum covers every position in `input_ids`, including any
    # special tokens the tokenizer adds — confirm this is intended.
    token_log_probs = log_probs.gather(2, inputs["input_ids"].unsqueeze(-1))
    return token_log_probs.sum().item()
  

# Caption index -> RoBERTa score divided by the caption's token count.
# Sample i owns indices 2*i (caption_0) and 2*i + 1 (caption_1).
likelihood = {}
count = 0
for sample in winoground:
    for caption_key in ("caption_0", "caption_1"):
        caption = sample[caption_key]
        likelihood[count] = calculate_log_likelihood(caption) / len(tokenizer.tokenize(caption))
        count += 1

# Render every score as a string with three decimals, then save as JSON.
text_likelihood = {idx: f'{score:.3f}' for idx, score in likelihood.items()}

with open("roberta_likelihood.json", 'w') as out_file:
    out_file.write(json.dumps(text_likelihood))