In [1]:
import os
import pickle

# Select the GPU *before* importing torch (or anything that imports it), so the
# setting is guaranteed to be in place by the time CUDA is first initialised.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

import pandas as pd

import torch
import evaluate
from sentence_transformers import SentenceTransformer, util
from transformers import CLIPModel, AutoTokenizer, AutoProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ground-truth captions for every stimulus, indexed by stimulus id.
# Files are opened with `with` so the handles are closed as soon as they are read.
with open('processed_data/stimuli_original_captions.pkl', 'rb') as captions_file:
    gt_captions = pickle.load(captions_file)

# Test-set stimulus ids; taken from subj01 because they are the same for all subjects.
with open('processed_data/subj01/sig_test_sub1.pkl', 'rb') as test_file:
    test_ids = pickle.load(test_file).keys()

# Keep only the captions of the test stimuli, in test-id order.
gt_captions = [gt_captions[idx] for idx in test_ids]

In [3]:
def load_pickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed before returning, so repeated
    calls do not accumulate open file handles.

    NOTE: unpickling can execute arbitrary code; only call this on trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Meteor

In [4]:
meteor = evaluate.load('meteor')

scores = []

for sub in [1, 2, 5, 7]:
    # Captions generated from the DINOv2 image features (stored per subject,
    # although they are the same for all subjects)
    dinov2_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_dinov2_captions.pkl')
    # Captions decoded from fMRI by the linear model [Brain Diffuser]
    linear_captions = load_pickle(f'results/sub0{sub}_linear_w_beam_fmri_captions.pkl')
    # Captions decoded from fMRI by the wide CNN
    widecnn_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_fmri_captions.pkl')
    # Captions decoded from fMRI by the shallow CNN
    shallowcnn_captions = load_pickle(f'results/sub0{sub}_shallowcnn_w_beam_fmri_captions.pkl')

    # column name -> (predictions, references); insertion order is the column order
    comparisons = {
        "dino_vs_gt": (dinov2_captions, gt_captions),
        "linear_vs_gt": (linear_captions, gt_captions),
        "linear_vs_dino": (linear_captions, dinov2_captions),
        "widecnn_vs_gt": (widecnn_captions, gt_captions),
        "widecnn_vs_dino": (widecnn_captions, dinov2_captions),
        "shallowcnn_vs_gt": (shallowcnn_captions, gt_captions),
        "shallowcnn_vs_dino": (shallowcnn_captions, dinov2_captions),
    }

    row = {"sub": sub}
    for label, (predicted, reference) in comparisons.items():
        row[label] = meteor.compute(predictions=predicted, references=reference)['meteor']
    scores.append(row)


# One row per subject, one column per comparison
scores = pd.DataFrame(scores)

print("======SUB01 ONLY========")
sub01_rows = scores[scores['sub']==1]
print(sub01_rows.mean())

print("======AVERAGED========")
# mean and standard deviation across subjects for every column
for col in scores.columns:
    print(f'{col}: {scores[col].mean() :.5f} {scores[col].std() :.5f}')

[nltk_data] Downloading package wordnet to /home/guest/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/guest/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/guest/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


sub                   1.000000
dino_vs_gt            0.343938
linear_vs_gt          0.267075
linear_vs_dino        0.415223
widecnn_vs_gt         0.271113
widecnn_vs_dino       0.457303
shallowcnn_vs_gt      0.268387
shallowcnn_vs_dino    0.435663
dtype: float64
sub: 3.75000 2.75379
dino_vs_gt: 0.34394 0.00000
linear_vs_gt: 0.26291 0.00708
linear_vs_dino: 0.41363 0.01083
widecnn_vs_gt: 0.27345 0.00813
widecnn_vs_dino: 0.45694 0.01552
shallowcnn_vs_gt: 0.26669 0.00937
shallowcnn_vs_dino: 0.43647 0.02579


# Rouge-1

In [5]:
rouge = evaluate.load('rouge')

scores = []

for sub in [1, 2, 5, 7]:
    # Captions generated from the DINOv2 image features (stored per subject,
    # although they are the same for all subjects)
    dinov2_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_dinov2_captions.pkl')
    # Captions decoded from fMRI by the linear model [Brain Diffuser]
    linear_captions = load_pickle(f'results/sub0{sub}_linear_w_beam_fmri_captions.pkl')
    # Captions decoded from fMRI by the wide CNN
    widecnn_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_fmri_captions.pkl')
    # Captions decoded from fMRI by the shallow CNN
    shallowcnn_captions = load_pickle(f'results/sub0{sub}_shallowcnn_w_beam_fmri_captions.pkl')

    # column name -> (predictions, references); insertion order is the column order
    comparisons = {
        "dino_vs_gt": (dinov2_captions, gt_captions),
        "linear_vs_gt": (linear_captions, gt_captions),
        "linear_vs_dino": (linear_captions, dinov2_captions),
        "widecnn_vs_gt": (widecnn_captions, gt_captions),
        "widecnn_vs_dino": (widecnn_captions, dinov2_captions),
        "shallowcnn_vs_gt": (shallowcnn_captions, gt_captions),
        "shallowcnn_vs_dino": (shallowcnn_captions, dinov2_captions),
    }

    row = {"sub": sub}
    for label, (predicted, reference) in comparisons.items():
        row[label] = rouge.compute(predictions=predicted, references=reference)['rouge1']
    scores.append(row)


# One row per subject, one column per comparison
scores = pd.DataFrame(scores)

print("======SUB01 ONLY========")
sub01_rows = scores[scores['sub']==1]
print(sub01_rows.mean())

print("======AVERAGED========")
# mean and standard deviation across subjects for every column
for col in scores.columns:
    print(f'{col}: {scores[col].mean() :.5f} {scores[col].std() :.5f}')

sub                   1.000000
dino_vs_gt            0.415151
linear_vs_gt          0.337494
linear_vs_dino        0.467228
widecnn_vs_gt         0.346206
widecnn_vs_dino       0.513230
shallowcnn_vs_gt      0.344890
shallowcnn_vs_dino    0.491798
dtype: float64
sub: 3.75000 2.75379
dino_vs_gt: 0.41515 0.00000
linear_vs_gt: 0.33197 0.00902
linear_vs_dino: 0.46236 0.01065
widecnn_vs_gt: 0.34682 0.00818
widecnn_vs_dino: 0.50934 0.01245
shallowcnn_vs_gt: 0.34008 0.00893
shallowcnn_vs_dino: 0.49201 0.02399


# Rouge-L

In [6]:
rouge = evaluate.load('rouge')

scores = []

for sub in [1, 2, 5, 7]:
    # Captions generated from the DINOv2 image features (stored per subject,
    # although they are the same for all subjects)
    dinov2_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_dinov2_captions.pkl')
    # Captions decoded from fMRI by the linear model [Brain Diffuser]
    linear_captions = load_pickle(f'results/sub0{sub}_linear_w_beam_fmri_captions.pkl')
    # Captions decoded from fMRI by the wide CNN
    widecnn_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_fmri_captions.pkl')
    # Captions decoded from fMRI by the shallow CNN
    shallowcnn_captions = load_pickle(f'results/sub0{sub}_shallowcnn_w_beam_fmri_captions.pkl')

    # column name -> (predictions, references); insertion order is the column order
    comparisons = {
        "dino_vs_gt": (dinov2_captions, gt_captions),
        "linear_vs_gt": (linear_captions, gt_captions),
        "linear_vs_dino": (linear_captions, dinov2_captions),
        "widecnn_vs_gt": (widecnn_captions, gt_captions),
        "widecnn_vs_dino": (widecnn_captions, dinov2_captions),
        "shallowcnn_vs_gt": (shallowcnn_captions, gt_captions),
        "shallowcnn_vs_dino": (shallowcnn_captions, dinov2_captions),
    }

    row = {"sub": sub}
    for label, (predicted, reference) in comparisons.items():
        row[label] = rouge.compute(predictions=predicted, references=reference)['rougeL']
    scores.append(row)


# One row per subject, one column per comparison
scores = pd.DataFrame(scores)

print("======SUB01 ONLY========")
sub01_rows = scores[scores['sub']==1]
print(sub01_rows.mean())

print("======AVERAGED========")
# mean and standard deviation across subjects for every column
for col in scores.columns:
    print(f'{col}: {scores[col].mean() :.5f} {scores[col].std() :.5f}')

sub                   1.000000
dino_vs_gt            0.374883
linear_vs_gt          0.306171
linear_vs_dino        0.440812
widecnn_vs_gt         0.315513
widecnn_vs_dino       0.491127
shallowcnn_vs_gt      0.316554
shallowcnn_vs_dino    0.469370
dtype: float64
sub: 3.75000 2.75379
dino_vs_gt: 0.37488 0.00000
linear_vs_gt: 0.30107 0.00748
linear_vs_dino: 0.43779 0.00965
widecnn_vs_gt: 0.31729 0.00724
widecnn_vs_dino: 0.48818 0.01206
shallowcnn_vs_gt: 0.31216 0.00870
shallowcnn_vs_dino: 0.46933 0.02624


# Sentence transformer

In [7]:
scores = []
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# The ground-truth captions do not depend on the subject, so they are embedded
# once here instead of once per loop iteration.
with torch.no_grad():
    # select first caption
    embedding_gt = sentence_model.encode([caps[0] for caps in gt_captions], convert_to_tensor=True)

for sub in [1, 2, 5, 7]:
    # Captions generated from the DINOv2 image features (stored per subject,
    # although they are the same for all subjects)
    dinov2_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_dinov2_captions.pkl')

    # linear model [Brain Diffuser]
    linear_captions = load_pickle(f'results/sub0{sub}_linear_w_beam_fmri_captions.pkl')

    # CNN wide
    widecnn_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_fmri_captions.pkl')

    # CNN shallow
    shallowcnn_captions = load_pickle(f'results/sub0{sub}_shallowcnn_w_beam_fmri_captions.pkl')

    with torch.no_grad():
        embedding_dinov2 = sentence_model.encode(dinov2_captions, convert_to_tensor=True)
        embedding_linear = sentence_model.encode(linear_captions, convert_to_tensor=True)
        embedding_widecnn = sentence_model.encode(widecnn_captions, convert_to_tensor=True)
        embedding_shallowcnn = sentence_model.encode(shallowcnn_captions, convert_to_tensor=True)

        # column name -> (prediction embeddings, reference embeddings)
        comparisons = {
            "dino_vs_gt": (embedding_dinov2, embedding_gt),
            "linear_vs_gt": (embedding_linear, embedding_gt),
            "linear_vs_dino": (embedding_linear, embedding_dinov2),
            "widecnn_vs_gt": (embedding_widecnn, embedding_gt),
            "widecnn_vs_dino": (embedding_widecnn, embedding_dinov2),
            "shallowcnn_vs_gt": (embedding_shallowcnn, embedding_gt),
            "shallowcnn_vs_dino": (embedding_shallowcnn, embedding_dinov2),
        }

        row = {"sub": sub}
        for label, (emb_pred, emb_ref) in comparisons.items():
            # The diagonal of the similarity matrix pairs caption k with reference k;
            # the score is the mean of those paired cosine similarities.
            row[label] = util.pytorch_cos_sim(emb_pred, emb_ref).diag().mean().item()
        scores.append(row)

# One row per subject, one column per comparison
scores = pd.DataFrame(scores)
print("======SUB01 ONLY========")
print(scores[scores['sub']==1].mean())
print("======AVERAGED========")
# mean and standard deviation across subjects for every column
for col in scores.columns:
    print(f'{col}: {scores[col].mean() :.5f} {scores[col].std() :.5f}')

sub                   1.000000
dino_vs_gt            0.577794
linear_vs_gt          0.358235
linear_vs_dino        0.428488
widecnn_vs_gt         0.396886
widecnn_vs_dino       0.498551
shallowcnn_vs_gt      0.372515
shallowcnn_vs_dino    0.463180
dtype: float64
sub: 3.75000 2.75379
dino_vs_gt: 0.57779 0.00000
linear_vs_gt: 0.34921 0.01518
linear_vs_dino: 0.42038 0.01865
widecnn_vs_gt: 0.38912 0.02196
widecnn_vs_dino: 0.48418 0.02767
shallowcnn_vs_gt: 0.36710 0.02537
shallowcnn_vs_dino: 0.45750 0.03300


# CLIP-B

In [8]:
scores = []

model_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer =  AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# The ground-truth captions do not depend on the subject, so their CLIP text
# features are computed once here instead of once per loop iteration.
with torch.no_grad():
    # select first caption
    embedding_gt = model_clip.get_text_features(**tokenizer([caps[0] for caps in gt_captions],return_tensors="pt",padding=True))

for sub in [1, 2, 5, 7]:
    # Captions generated from the DINOv2 image features (stored per subject,
    # although they are the same for all subjects)
    dinov2_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_dinov2_captions.pkl')

    # linear model [Brain Diffuser]
    linear_captions = load_pickle(f'results/sub0{sub}_linear_w_beam_fmri_captions.pkl')

    # CNN wide
    widecnn_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_fmri_captions.pkl')

    # CNN shallow
    shallowcnn_captions = load_pickle(f'results/sub0{sub}_shallowcnn_w_beam_fmri_captions.pkl')

    with torch.no_grad():
        embedding_dinov2 = model_clip.get_text_features(**tokenizer(dinov2_captions,return_tensors="pt",padding=True))
        embedding_linear = model_clip.get_text_features(**tokenizer(linear_captions,return_tensors="pt",padding=True))
        embedding_widecnn = model_clip.get_text_features(**tokenizer(widecnn_captions,return_tensors="pt",padding=True))
        embedding_shallowcnn = model_clip.get_text_features(**tokenizer(shallowcnn_captions,return_tensors="pt",padding=True))

        # column name -> (prediction embeddings, reference embeddings)
        comparisons = {
            "dino_vs_gt": (embedding_dinov2, embedding_gt),
            "linear_vs_gt": (embedding_linear, embedding_gt),
            "linear_vs_dino": (embedding_linear, embedding_dinov2),
            "widecnn_vs_gt": (embedding_widecnn, embedding_gt),
            "widecnn_vs_dino": (embedding_widecnn, embedding_dinov2),
            "shallowcnn_vs_gt": (embedding_shallowcnn, embedding_gt),
            "shallowcnn_vs_dino": (embedding_shallowcnn, embedding_dinov2),
        }

        row = {"sub": sub}
        for label, (emb_pred, emb_ref) in comparisons.items():
            # The diagonal of the similarity matrix pairs caption k with reference k;
            # the score is the mean of those paired cosine similarities.
            row[label] = util.pytorch_cos_sim(emb_pred, emb_ref).diag().mean().item()
        scores.append(row)


# One row per subject, one column per comparison
scores = pd.DataFrame(scores)
print("======SUB01 ONLY========")
print(scores[scores['sub']==1].mean())
print("======AVERAGED========")
# mean and standard deviation across subjects for every column
for col in scores.columns:
    print(f'{col}: {scores[col].mean() :.5f} {scores[col].std() :.5f}')

sub                   1.000000
dino_vs_gt            0.773245
linear_vs_gt          0.671410
linear_vs_dino        0.714764
widecnn_vs_gt         0.687283
widecnn_vs_dino       0.746361
shallowcnn_vs_gt      0.675333
shallowcnn_vs_dino    0.731151
dtype: float64
sub: 3.75000 2.75379
dino_vs_gt: 0.77325 0.00000
linear_vs_gt: 0.66732 0.00605
linear_vs_dino: 0.71140 0.00667
widecnn_vs_gt: 0.67788 0.01158
widecnn_vs_dino: 0.73604 0.01386
shallowcnn_vs_gt: 0.67217 0.01318
shallowcnn_vs_dino: 0.72788 0.01758


# CLIP-L

In [9]:
scores = []

model_clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
tokenizer =  AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# The ground-truth captions do not depend on the subject, so their CLIP text
# features are computed once here instead of once per loop iteration.
# `embedding_gt` is deliberately left at notebook scope: the example cells that
# follow rank samples against it.
with torch.no_grad():
    # select first caption
    embedding_gt = model_clip.get_text_features(**tokenizer([caps[0] for caps in gt_captions],return_tensors="pt",padding=True))

for sub in [1, 2, 5, 7]:
    # Captions generated from the DINOv2 image features (stored per subject,
    # although they are the same for all subjects)
    dinov2_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_dinov2_captions.pkl')

    # linear model [Brain Diffuser]
    linear_captions = load_pickle(f'results/sub0{sub}_linear_w_beam_fmri_captions.pkl')

    # CNN wide
    widecnn_captions = load_pickle(f'results/sub0{sub}_widecnn_w_beam_fmri_captions.pkl')

    # CNN shallow
    shallowcnn_captions = load_pickle(f'results/sub0{sub}_shallowcnn_w_beam_fmri_captions.pkl')

    with torch.no_grad():
        embedding_dinov2 = model_clip.get_text_features(**tokenizer(dinov2_captions,return_tensors="pt",padding=True))
        embedding_linear = model_clip.get_text_features(**tokenizer(linear_captions,return_tensors="pt",padding=True))
        embedding_widecnn = model_clip.get_text_features(**tokenizer(widecnn_captions,return_tensors="pt",padding=True))
        embedding_shallowcnn = model_clip.get_text_features(**tokenizer(shallowcnn_captions,return_tensors="pt",padding=True))

        # column name -> (prediction embeddings, reference embeddings)
        comparisons = {
            "dino_vs_gt": (embedding_dinov2, embedding_gt),
            "linear_vs_gt": (embedding_linear, embedding_gt),
            "linear_vs_dino": (embedding_linear, embedding_dinov2),
            "widecnn_vs_gt": (embedding_widecnn, embedding_gt),
            "widecnn_vs_dino": (embedding_widecnn, embedding_dinov2),
            "shallowcnn_vs_gt": (embedding_shallowcnn, embedding_gt),
            "shallowcnn_vs_dino": (embedding_shallowcnn, embedding_dinov2),
        }

        row = {"sub": sub}
        for label, (emb_pred, emb_ref) in comparisons.items():
            # The diagonal of the similarity matrix pairs caption k with reference k;
            # the score is the mean of those paired cosine similarities.
            row[label] = util.pytorch_cos_sim(emb_pred, emb_ref).diag().mean().item()
        scores.append(row)

# One row per subject, one column per comparison
scores = pd.DataFrame(scores)
print("======SUB01 ONLY========")
print(scores[scores['sub']==1].mean())
print("======AVERAGED========")
# mean and standard deviation across subjects for every column
for col in scores.columns:
    print(f'{col}: {scores[col].mean() :.5f} {scores[col].std() :.5f}')

sub                   1.000000
dino_vs_gt            0.692639
linear_vs_gt          0.562937
linear_vs_dino        0.633645
widecnn_vs_gt         0.584748
widecnn_vs_dino       0.673862
shallowcnn_vs_gt      0.571342
shallowcnn_vs_dino    0.657476
dtype: float64
sub: 3.75000 2.75379
dino_vs_gt: 0.69264 0.00000
linear_vs_gt: 0.55721 0.00799
linear_vs_dino: 0.62834 0.00908
widecnn_vs_gt: 0.57591 0.01311
widecnn_vs_dino: 0.66478 0.01555
shallowcnn_vs_gt: 0.56650 0.01668
shallowcnn_vs_dino: 0.65189 0.02274


# Some examples sorted by CLIP-L cosine similarity

In [10]:
# Everything the sub01 examples need is loaded/computed here explicitly, instead of
# relying on `dinov2_captions` and `embedding_gt` left over from the last iteration
# (sub07) of the scoring loop.
# `model_clip` / `tokenizer` are the CLIP-L model and tokenizer loaded in the CLIP-L section.
widecnn_captions = load_pickle('results/sub01_widecnn_w_beam_fmri_captions.pkl')
dinov2_captions = load_pickle('results/sub01_widecnn_w_beam_dinov2_captions.pkl')

with torch.no_grad():
    # first ground-truth caption of every test stimulus
    embedding_gt = model_clip.get_text_features(**tokenizer([caps[0] for caps in gt_captions],return_tensors="pt",padding=True))
    embedding_widecnn = model_clip.get_text_features(**tokenizer(widecnn_captions,return_tensors="pt",padding=True))

In [11]:
# Cosine similarity between each wide-CNN caption and the ground-truth caption at
# the same position (diagonal of the full similarity matrix), then the sample
# indices ordered from lowest to highest similarity.
paired_similarity = util.pytorch_cos_sim(embedding_widecnn, embedding_gt).diag()
_, ids = paired_similarity.sort()

In [12]:
# good examples: the ten samples at the end of `ids`
# (highest similarity, since `ids` is sorted in ascending order)
for sample_idx in ids[-10:].tolist():
    print('True caption: ', gt_captions[sample_idx])
    print('fMRI pred caption: ', widecnn_captions[sample_idx])
    print('Dinov2 pred caption: ', dinov2_captions[sample_idx])
    print('==================================')

True caption:  ['Two giraffes that are standing in the grass.', 'Two giraffes are standing together in the field.']
fMRI pred caption:  A couple of giraffes standing in a field.
Dinov2 pred caption:  A couple of giraffes standing in a field.
True caption:  ['Group of people standing on the side of a busy city street.']
fMRI pred caption:  A group of people walking on a city street.
Dinov2 pred caption:  A group of people that are standing in the street.
True caption:  ['A man on a surfboard in the ocean.']
fMRI pred caption:  A person on a surfboard on a body of water.
Dinov2 pred caption:  A person on a surfboard in the ocean.
True caption:  ['a bath room with a toilet a sink and a mirror']
fMRI pred caption:  A bathroom with a sink, toilet and sink.
Dinov2 pred caption:  A bathroom with a white toilet and a white sink.
True caption:  ['A man on a surf board riding a wave in the ocean.', 'A guy surfing on a wave in the ocean.']
fMRI pred caption:  A person on a surfboard in the ocean.

In [13]:
# bad examples: the ten samples at the start of `ids`
# (lowest similarity, since `ids` is sorted in ascending order)
for sample_idx in ids[:10].tolist():
    print('True caption: ', gt_captions[sample_idx])
    print('fMRI pred caption: ', widecnn_captions[sample_idx])
    print('Dinov2 pred caption: ', dinov2_captions[sample_idx])
    print('==================================')

True caption:  ['Green beans, purple beans and lavender arranged along with a scissor on a bamboo mat.']
fMRI pred caption:  a close up of a plate of food on a table.
Dinov2 pred caption:  a close up of a bottle of water on a table top.
True caption:  ['A woman in a tank top holds a rainbow colored umbrella.']
fMRI pred caption:  A black and white photo of a black and white dog.
Dinov2 pred caption:  a close up of a woman holding a cell phone in her left hand.
True caption:  ['A little boy in a blue shirt smiles and stands next to a fire hydrant.', 'A little boy is walking by a fire hydrant.']
fMRI pred caption:  A black and white cat laying on a couch.
Dinov2 pred caption:  A red fire hydrant in front of a building.
True caption:  ['A woman walks down the sidewalk in front of a red wall and a yellow fire hydrant.']
fMRI pred caption:  A dog that is sitting on the floor of a living room.
Dinov2 pred caption:  A black and white photo of a fire hydrant on the side of the street.
True cap