# SCORING: Tutorial

### libraries

In [23]:
import torch
import clip
from PIL import Image
from transformers import AutoProcessor, BlipModel

### Scoring Functions

The CLIP scoring code is taken from: [CLIP score](https://unimatrixz.com/blog/latent-space-clip-score/)

In [20]:
def get_clip_score(image_path, text):
# Load the pre-trained CLIP model and the image
    model, preprocess = clip.load('ViT-B/32')
    image = Image.open(image_path)

    # Preprocess the image and tokenize the text
    image_input = preprocess(image).unsqueeze(0)
    text_input = clip.tokenize([text])
    
    # Move the inputs to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    image_input = image_input.to(device)
    text_input = text_input.to(device)
    model = model.to(device)
    
    # Generate embeddings for the image and text
    with torch.no_grad():
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_input)
    
    # Normalize the features
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
    # Calculate the cosine similarity to get the CLIP score
    clip_score = torch.matmul(image_features, text_features.T).item()
    
    return clip_score

### BLIP scoring

The BLIP scoring code is taken from: [BLIP HF](https://huggingface.co/docs/transformers/en/model_doc/blip)

In [19]:
def get_blip_score(image_path, prompt):
# Load the pre-trained CLIP model and the image
    model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

    # Preprocess the image and tokenize the text
    image = Image.open(image_path)
    inputs = processor(
        text=prompt, images=image, return_tensors="pt", padding=True
        )
    
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    
    return logits_per_image

## CLIP scoring

In [21]:
image_path = "/home/safouane/Downloads/SRT2I/optimal_pairs4/1_elephant.png"
prompt = "Wide photo of two playful elephants, one spraying water using trunk, another playing with a ball, Morning, grasslands, herd of elephants visible in background, sharp, photography, Canon EOS R5, 24-70mm, f/4, natural lighting"

In [22]:
score = get_clip_score(image_path, prompt)
score

0.31494140625

In [24]:
get_blip_score(image_path, prompt)

Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_model.

tensor([[-0.9941]], grad_fn=<TBackward0>)

In [18]:
logits_per_image[0,0]

tensor(-1.0700, grad_fn=<SelectBackward0>)