In [1]:
import os
from PIL import Image
from tqdm.notebook import tqdm

import torch
import numpy as np

import open_clip

from pycocotools.coco import COCO

In [2]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

### Fetch COCO Captions and Images

In [3]:
# COCO train2017 inputs; both paths are relative to the kernel's working directory.
image_dir: str = "../../LAION/Data/coco_train2017/"  # folder of training images
caption_file: str = "../../LAION/Data/coco_captions/captions_train2017.json"  # caption annotations

In [4]:
# Index the annotation file, then collect the caption string of every annotation
# in the order the COCO index stores them.
coco_captions = COCO(caption_file)
annotations = coco_captions.anns.values()
captions = [annotation["caption"] for annotation in annotations]
captions[:10]

loading annotations into memory...
Done (t=0.87s)
creating index...
index created!


['A bicycle replica with a clock as the front wheel.',
 'A room with blue walls and a white sink and door.',
 'A car that seems to be parked illegally behind a legally parked car',
 'A large passenger airplane flying through the air.',
 'There is a GOL plane taking off in a partly cloudy sky.',
 'Blue and white color scheme in a small bathroom.',
 'This is a blue and white bathroom with a wall sink and a lifesaver on the wall.',
 'A blue boat themed bathroom with a life preserver on the wall',
 'The bike has a clock as a tire.',
 'two cars parked on the sidewalk on the street']

### Load Model

In [8]:
# Build the ViT-H-14 CLIP model from the 'laion2b_s32b_b79k' pretrained tag together
# with the image transform used below; the second return value is not needed here.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-H-14', pretrained='laion2b_s32b_b79k')
model.to(device)
# open_clip hands the model back in train mode; this notebook only runs inference,
# so switch to eval mode to keep any train-only layers (dropout, stochastic depth) inactive.
model.eval()

tokenizer = open_clip.get_tokenizer('ViT-H-14')

### Create Caption Embeddings

In [8]:
def batch_encode_text(captions, batch_size=64, out_path="./embeds/clip-H-14_text_coco_train17.npy"):
    """Encode captions with the CLIP text encoder and save the normalised embeddings.

    Captions are tokenized and encoded batch by batch in input order; each
    embedding is divided by its norm along the last dimension, and the
    concatenated result is written to ``out_path`` as a ``.npy`` array.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        captions: Sequence of caption strings.
        batch_size: Number of captions per forward pass.
        out_path: Destination file; defaults to the location this notebook
            has always written to.

    Raises:
        ValueError: If ``captions`` is empty.
    """
    if len(captions) == 0:
        raise ValueError("captions is empty; nothing to encode")

    # Make sure the target folder exists *before* the long encoding pass, so a
    # missing directory cannot throw away the finished embeddings at save time.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    feats = []
    with torch.no_grad():
        for start in tqdm(range(0, len(captions), batch_size)):
            tokens = tokenizer(captions[start:start + batch_size]).to(device)
            text_features = model.encode_text(tokens)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            # Move each batch off the accelerator right away to keep device memory flat.
            feats.append(text_features.cpu())

    np.save(out_path, torch.cat(feats, dim=0).numpy())

In [9]:
# Embed every caption in `captions`; the function writes the array to disk and returns nothing.
batch_encode_text(captions)

  0%|          | 0/9247 [00:00<?, ?it/s]

### Create Image Embeddings

In [9]:
# Sanity-check the image transform on one training image. The file is located
# through `image_dir` rather than an absolute, machine-specific path so the cell
# runs wherever the dataset paths above are valid, and the file handle is closed
# as soon as the image has been transformed.
with Image.open(os.path.join(image_dir, "000000000072.jpg")) as sample_image:
    sample_shape = preprocess(sample_image).shape
sample_shape

torch.Size([3, 224, 224])

In [5]:
def batch_encode_img(image_dir, batch_size=64, out_path="./embeds/clip-H-14_img_coco_train17.npy"):
    """Encode every file in a directory with the CLIP image encoder and save the embeddings.

    Files are processed in sorted filename order, so the rows of the saved
    array follow ``sorted(os.listdir(image_dir))``. Each embedding is divided
    by its norm along the last dimension before being stored.

    Uses the notebook-level ``preprocess``, ``model`` and ``device``.

    Args:
        image_dir: Directory whose entries are all opened as images.
        batch_size: Number of images per forward pass.
        out_path: Destination ``.npy`` file; defaults to the location this
            notebook has always written to.

    Raises:
        ValueError: If ``image_dir`` contains no entries.
    """
    files = sorted(os.listdir(image_dir))
    if not files:
        raise ValueError(f"no files found in {image_dir!r}")

    # Create the output folder up front so a missing directory fails fast
    # instead of after the whole dataset has been encoded.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    image_tensors = []
    with torch.no_grad():
        # Iterate over the listing taken above (not a second os.listdir call) so
        # the loop bound and the slices always refer to the same set of files.
        for start in tqdm(range(0, len(files), batch_size)):
            batch = []
            for filename in files[start:start + batch_size]:
                # Close each file once it is transformed; otherwise one handle
                # per image stays open until garbage collection.
                with Image.open(os.path.join(image_dir, filename)) as img:
                    batch.append(preprocess(img))

            image_feats = model.encode_image(torch.stack(batch, dim=0).to(device))
            image_feats = image_feats / image_feats.norm(dim=-1, keepdim=True)
            image_tensors.append(image_feats.cpu())

    np.save(out_path, torch.cat(image_tensors, dim=0).numpy())

In [6]:
# Embed all files in `image_dir`, 32 images per forward pass (overriding the default of 64).
batch_encode_img(image_dir, batch_size=32)

  0%|          | 0/3697 [00:00<?, ?it/s]