In [4]:
import os
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

#Loading SciBERT
scibert_tokenizer=AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
scibert_model =AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

#Loadi9ng CLIP
clip_model =CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor= CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")




In [5]:
with open("../data/captions.json") as f:
    captions=json.load(f)

os.makedirs("../data/embeddings", exist_ok=True)

for filename, caption in tqdm(captions.items()):
    tokens=scibert_tokenizer(caption, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        embedding=scibert_model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()
    np.save(f"../data/embeddings/{filename}_text.npy", embedding)


100%|███████████████████████████████████████| 3685/3685 [01:19<00:00, 46.39it/s]


In [7]:
image_dir="../data/images"
output_dir="../data/embeddings"
os.makedirs(output_dir, exist_ok=True)
for filename in tqdm(os.listdir(image_dir)):
    if not filename.endswith(".png"):
        continue
    image_path=os.path.join(image_dir, filename)
    try:
        image =Image.open(image_path).convert("RGB")
        inputs=clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features =clip_model.get_image_features(**inputs)

        #function to ensure correct shaping 
        if features.dim() == 2 and features.shape[0] == 1:
            embedding =features[0].numpy()
        elif features.dim() ==1:
            embedding = features.numpy()
        else:
            raise ValueError(f"Unexpected feature shape:{features.shape}")

        np.save(os.path.join(output_dir, f"{filename}_image.npy"),embedding)
        
    except Exception as e:
        print(f"[⚠️] Failed on {filename}:{e}")


  5%|██                                      | 337/6463 [00:15<04:23, 23.27it/s]

[⚠️] Failed on 2507.13334v1_fig38.png: mean must have 1 elements if it is an iterable, got 3


  6%|██▌                                     | 410/6463 [00:19<05:12, 19.36it/s]

[⚠️] Failed on 2507.12565v1_fig16.png: mean must have 1 elements if it is an iterable, got 3


 51%|████████████████████                   | 3321/6463 [02:48<02:50, 18.42it/s]

[⚠️] Failed on 2507.13334v1_fig40.png: mean must have 1 elements if it is an iterable, got 3


 88%|██████████████████████████████████▏    | 5674/6463 [04:49<00:41, 18.92it/s]

[⚠️] Failed on 2507.12565v1_fig14.png: mean must have 1 elements if it is an iterable, got 3


100%|███████████████████████████████████████| 6463/6463 [05:32<00:00, 19.42it/s]
