In [11]:
import torch
import numpy as np
import os
from tqdm import tqdm

### Create Dataset

In [15]:
data_dir = "/mnt/ssd/ronak/datasets/imagenet_captions_50k"

# The two feature files are read with torch.load and then wrapped with
# torch.from_numpy, i.e. they are expected to contain NumPy arrays.
# NOTE(review): recent PyTorch releases may default torch.load to
# weights_only=True, which can refuse pickled NumPy arrays — confirm against
# the installed version.
img, txt = (
    torch.from_numpy(torch.load(os.path.join(data_dir, feature_file)))
    for feature_file in (
        "vit_b32_laion2b_image_features.pt",
        "vit_b32_laion2b_text_features.pt",
    )
)
# Loaded as-is (no from_numpy conversion is applied to this file).
idx = torch.load(os.path.join(data_dir, "vit_b32_laion2b_idx.pt"))

In [16]:
# data should be prenormalized: the L2 norm of each of the first 10 image
# feature rows is displayed below and should be (numerically) equal to 1
np.linalg.norm(img[:10], axis=1)

array([1.        , 1.        , 1.0000001 , 0.99999994, 1.        ,
       1.        , 1.        , 0.99999994, 1.        , 1.        ],
      dtype=float32)

In [17]:
n = len(img)
np.random.seed(123)
test_size = int(0.1 * n)

# Sample the training rows without replacement; every remaining row index
# (in ascending order) becomes the held-out test set.
train_idx = np.random.choice(n, size=(n - test_size), replace=False)
test_idx = np.setdiff1d(np.arange(n), train_idx)

# Sanity checks: the two index sets are disjoint and together cover 0..n-1.
assert np.intersect1d(train_idx, test_idx).size == 0
assert np.union1d(train_idx, test_idx).size == n

In [18]:
# torch.as_tensor accepts both NumPy arrays and tensors, so this cell can be
# re-run after train_idx/test_idx have already been rebound to tensors
# (torch.from_numpy raises TypeError when handed a tensor).
train_idx = torch.as_tensor(train_idx)
test_idx = torch.as_tensor(test_idx)

# Select the same rows from the image and text features so that row i of
# x_* stays paired with row i of y_*.
x_train, x_test = img[train_idx], img[test_idx]
y_train, y_test = txt[train_idx], txt[test_idx]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

torch.Size([43422, 512])
torch.Size([43422, 512])
torch.Size([4824, 512])
torch.Size([4824, 512])


In [19]:
# Write each split into data_dir, in the order x_train, y_train, x_test, y_test.
for _fname, _tensor in (
    ("x_train.pt", x_train),
    ("y_train.pt", y_train),
    ("x_test.pt", x_test),
    ("y_test.pt", y_test),
):
    torch.save(_tensor, os.path.join(data_dir, _fname))

### Compute Baseline Loss

In [29]:
def compute_batch_loss(x, y, scale=100.0):
    """Symmetric cross-entropy loss between two batches of paired features.

    Row ``i`` of ``x`` is treated as the match of row ``i`` of ``y``. The
    scaled similarity matrix ``x @ y.T`` is scored with cross-entropy along
    both its rows and its columns, and the two losses are averaged.

    Args:
        x: 2-D tensor of features, one row per example.
        y: 2-D tensor with the same number of rows and columns as ``x``.
        scale: Multiplier applied to the similarities before the softmax.

    Returns:
        A 0-dim tensor holding the averaged loss.
    """
    # scale is taken from vit_b32_laion2b model
    logits = torch.matmul(x, y.T) * scale
    # Create the targets on the logits' device; a default (CPU) arange makes
    # cross_entropy fail when the features live on an accelerator.
    labels = torch.arange(len(logits), device=logits.device)
    loss_i = torch.nn.functional.cross_entropy(logits, labels)
    loss_t = torch.nn.functional.cross_entropy(logits.T, labels)
    loss = (loss_i + loss_t) / 2
    return loss
        
def compute_loss(batch_size, X, Y, n_bootstrap=3000, seed=123):
    """Average ``compute_batch_loss`` over randomly drawn batches.

    For each of ``n_bootstrap`` rounds, ``batch_size`` distinct row indices
    are sampled (without replacement within a round) and the batch loss of
    the corresponding rows of ``X`` and ``Y`` is recorded.

    Args:
        batch_size: Number of rows per sampled batch; must not exceed ``len(X)``.
        X: Features indexable by an integer array along the first axis.
        Y: Features paired row-by-row with ``X``.
        n_bootstrap: Number of random batches to average over.
        seed: Seed for the batch sampler. The default reproduces the batches
            previously obtained with ``np.random.seed(123)``.

    Returns:
        A 0-dim tensor with the mean loss over all sampled batches.
    """
    N = len(X)
    # A private generator seeded like the former np.random.seed call yields the
    # same index stream without resetting NumPy's global RNG for other code.
    rng = np.random.RandomState(seed)
    losses = torch.zeros(n_bootstrap)
    for i in tqdm(range(n_bootstrap)):
        idx = rng.choice(N, size=(batch_size,), replace=False)
        losses[i] = compute_batch_loss(X[idx], Y[idx])
    return losses.mean()

# Model identifier; only used as a label in the printed loss reports.
model_name = 'vit-b32-laion2b_s34b_b79k'

In [33]:
batch_size = 64

# Mean loss over random batches of this size, for the train and test splits.
train_loss = compute_loss(batch_size, X=x_train, Y=y_train)
test_loss = compute_loss(batch_size, X=x_test, Y=y_test)

print("\n".join([
    f"model:        {model_name}",
    f"batch size:   {batch_size}",
    f"  train loss: {train_loss:0.5f}",
    f"  test loss:  {test_loss:0.5f}",
]))

100%|██████████| 3000/3000 [00:02<00:00, 1064.92it/s]
100%|██████████| 3000/3000 [00:01<00:00, 2705.53it/s]

model:        vit-b32-laion2b_s34b_b79k
batch size:   64
  train loss: 2.28431
  test loss:  2.34897





In [30]:
batch_size = 128

# Mean loss over random batches of this size, for the train and test splits.
train_loss = compute_loss(batch_size, X=x_train, Y=y_train)
test_loss = compute_loss(batch_size, X=x_test, Y=y_test)

print("\n".join([
    f"model:        {model_name}",
    f"batch size:   {batch_size}",
    f"  train loss: {train_loss:0.5f}",
    f"  test loss:  {test_loss:0.5f}",
]))

100%|██████████| 3000/3000 [00:03<00:00, 920.23it/s] 
100%|██████████| 3000/3000 [00:01<00:00, 2351.16it/s]

model:        vit-b32-laion2b_s34b_b79k
batch size:   128
  train loss: 2.97814
  test loss:  3.04914





In [31]:
batch_size = 256

# Mean loss over random batches of this size, for the train and test splits.
train_loss = compute_loss(batch_size, X=x_train, Y=y_train)
test_loss = compute_loss(batch_size, X=x_test, Y=y_test)

print("\n".join([
    f"model:        {model_name}",
    f"batch size:   {batch_size}",
    f"  train loss: {train_loss:0.5f}",
    f"  test loss:  {test_loss:0.5f}",
]))

100%|██████████| 3000/3000 [00:03<00:00, 778.94it/s]
100%|██████████| 3000/3000 [00:02<00:00, 1416.96it/s]

model:        vit-b32-laion2b_s34b_b79k
batch size:   256
  train loss: 3.72472
  test loss:  3.79841





In [32]:
batch_size = 512

# Mean loss over random batches of this size, for the train and test splits.
train_loss = compute_loss(batch_size, X=x_train, Y=y_train)
test_loss = compute_loss(batch_size, X=x_test, Y=y_test)

print("\n".join([
    f"model:        {model_name}",
    f"batch size:   {batch_size}",
    f"  train loss: {train_loss:0.5f}",
    f"  test loss:  {test_loss:0.5f}",
]))

100%|██████████| 3000/3000 [00:07<00:00, 407.67it/s]
100%|██████████| 3000/3000 [00:05<00:00, 521.28it/s]

model:        vit-b32-laion2b_s34b_b79k
batch size:   512
  train loss: 4.48539
  test loss:  4.56147



