# Multimodal Sentiment Analysis

### Overview
- This project performs sentiment analysis on MASAD, an open-source dataset containing ~38,000 text/image pairs.
- We preprocess the data into cleaned text and augmented images and then load the data.
- Our architecture is comprised of a transformer encoder, keyless attention to weight each modality, and a classification head.
- The test accuracy of our model is roughly 94% and our F1 score is also roughly 94%, comparable to the model used in the original MASAD paper.




---

### Instructions to Run Project
1. Download the following datasets: MASAD, MASAD_SUBSET
2. Create a shortcut to both datasets in your Google Drive

In [None]:
!pip install tqdm
!pip install tensorboard

In [None]:
import os
import pandas as pd
from PIL import Image
from torchvision import transforms
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor
import re
import glob
import sys
import sklearn
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm
import pickle
from transformers import get_linear_schedule_with_warmup

In [None]:
# Mount Google Drive at /content/drive so the dataset and project folders under
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"model will run on {device}")

model will run on cuda


# Configuration parameters

In [None]:
# Set configs for program (e.g. run w/ subset of data for faster processing & debugging)
# use_data_subset: True reads from MASAD_SUBSET and uses the *_subset file names; False uses the full MASAD data
# extract_text_bool: True reads the raw text out of every text file; False loads a previously pickled DataFrame
# generate_clip_embeddings: True runs CLIP to build the embeddings; False loads saved embeddings/labels from disk
use_data_subset = True
extract_text_bool = True
generate_clip_embeddings = True

In [None]:
def set_max_cell_output_height(max_height=200):
    """Visually limit the size of the cell output block to reduce the amount of scrolling.

    Unfortunately this cannot be set globally -- call it in any cell or function
    which prints long outputs.

    Args:
        max_height: value passed as ``maxHeight`` to ``google.colab.output.setIframeHeight``.
    """
    # Import display explicitly instead of relying on the name IPython injects into notebook globals
    from IPython.display import Javascript, display
    display(Javascript(f'''google.colab.output.setIframeHeight(0, true, {{maxHeight: {max_height}}})'''))

# Preprocessing Data


### Accessing File Paths to Read & Process Data

In [None]:
# Dataset root on Drive: the MASAD_SUBSET copy when use_data_subset is set, otherwise the full MASAD folder
root_dir = '/content/drive/MyDrive/MASAD_SUBSET' if use_data_subset else '/content/drive/MyDrive/MASAD/'

### Preprocessing: Text only

In [None]:
# Goal: Create a .csv with file paths for both modalities and the labels
# We need:
# 1. image paths
# 2. text paths
# 3. labels
# These three module-level lists are appended to in parallel by ensure_valid_data()
# (one entry per matched image/text pair).
image_paths = []
text_paths = []
labels = []

# Directory names iterated by ensure_valid_data(): <root_dir>/<split>/<image|text>/<sentiment>
splits = ['train', 'test']
sentiments = ['negative', 'positive']

Step 1: Create a basic .csv where each row contains a valid image filepath in Google Drive, its corresponding valid text filepath, and the sentiment label of the pair.

In [None]:
# Validity check used while pairing image and text files
# (first attempt at circumventing data formatting issues while loading)
def is_valid_file(file_path, extension):
    """Return True when ``file_path`` is an existing, non-hidden file ending in ``extension``.

    The path is lower-cased before the suffix comparison, so ``extension`` is
    expected in lower case (e.g. ``'.txt'``).
    """
    if not os.path.isfile(file_path):
        return False
    # Skip dot-files
    if os.path.basename(file_path).startswith('.'):
        return False
    return file_path.lower().endswith(extension)

In [None]:
def ensure_valid_data():
    """Build and save the CSV index of matched image/text pairs.

    For every split in ``splits`` and sentiment in ``sentiments`` this looks under
    ``root_dir/<split>/image/<sentiment>`` for ``.jpg`` files (recursively) and pairs
    each one with the ``.txt`` file at the same relative path under
    ``root_dir/<split>/text/<sentiment>``. Pairs with a valid text file are recorded
    together with the sentiment as the label; images without one are reported and skipped.

    The module-level ``image_paths``, ``text_paths`` and ``labels`` lists are emptied
    first, so re-running the cell rebuilds the index instead of appending a second
    copy of every row.

    The resulting DataFrame (columns ``image_path``, ``text_path``, ``label``) is
    written to ``MASAD_processed_subset.csv`` or ``MASAD_processed.csv`` -- depending
    on ``use_data_subset`` -- in the project folder on Drive.
    """
    # Reset the shared accumulators so repeated calls do not duplicate rows
    image_paths.clear()
    text_paths.clear()
    labels.clear()

    for split in splits:
        for sentiment in sentiments:
            # Parallel directory trees for the two modalities
            image_modality_path = os.path.join(root_dir, split, 'image', sentiment)
            text_modality_path = os.path.join(root_dir, split, 'text', sentiment)

            # Report and skip split/sentiment combinations whose folders are missing
            if not os.path.isdir(image_modality_path):
                print(f"Missing image modality directory: {image_modality_path}")
                continue
            if not os.path.isdir(text_modality_path):
                print(f"Missing text modality directory: {text_modality_path}")
                continue

            # Recursively find all .jpg files; sorted so the CSV row order is reproducible
            image_pattern = os.path.join(image_modality_path, '**', '*.jpg')
            found_images = sorted(glob.glob(image_pattern, recursive=True))

            for image_file in found_images:
                # Mirror the image's relative path into the text tree and swap the extension
                relative_image_path = os.path.relpath(image_file, image_modality_path)
                text_file = os.path.join(text_modality_path, relative_image_path)
                text_file = os.path.splitext(text_file)[0] + '.txt'

                # Only keep pairs whose text file actually exists
                if is_valid_file(text_file, '.txt'):
                    image_paths.append(image_file)
                    text_paths.append(text_file)
                    labels.append(sentiment)
                else:
                    print(f"Missing text file for image: {image_file}")

    # Create basic df
    df = pd.DataFrame({
        'image_path': image_paths,
        'text_path': text_paths,
        'label': labels,
    })

    # Save df under a name that reflects which dataset variant was indexed
    target_dir = '/content/drive/MyDrive/computer_vision_final_project/'
    os.makedirs(target_dir, exist_ok=True)
    if use_data_subset:
        df_save_path = os.path.join(target_dir, 'MASAD_processed_subset.csv')
    else:
        df_save_path = os.path.join(target_dir, 'MASAD_processed.csv')
    df.to_csv(df_save_path, index=False)
    print(f"\nDataFrame saved at {df_save_path}")

Load basic filepaths .csv

In [None]:
# Pick the CSV index that matches the dataset variant in use
df_save_path = (
    '/content/drive/MyDrive/computer_vision_final_project/MASAD_processed_subset.csv'
    if use_data_subset
    else '/content/drive/MyDrive/computer_vision_final_project/MASAD_processed.csv'
)

# Read the index into a DataFrame and preview it
df = pd.read_csv(df_save_path)
print(df.head())

Step 2: Retrieve the raw text for each row and add it to df as a 'raw_text' column.

In [None]:
tqdm.pandas()

def extract_text(text_file):
    """Read one text file and return its contents with surrounding whitespace stripped.

    Args:
        text_file: path to a UTF-8 encoded text file.

    Returns:
        str: the stripped contents, or ``""`` when the file cannot be opened or
        decoded, so a single bad row does not abort a whole ``apply`` pass.
    """
    try:
        with open(text_file, 'r', encoding='utf-8') as file:
            return file.read().strip()
    except (OSError, ValueError, TypeError):
        # OSError: missing or unreadable file
        # ValueError: invalid UTF-8 (UnicodeDecodeError) or a malformed path
        # TypeError: a non-path value such as None or NaN
        return ""

# Reading every text file is slow, so progress_apply is used to show a progress bar
if extract_text_bool:
  # Report the flag itself (the original interpolated the extract_text function object)
  print(f'value of extract_text_bool: {extract_text_bool}')
  df['raw_text'] = df['text_path'].progress_apply(extract_text)
else:
  print('extract_text_bool is FALSE, so reading in the pickled df from MASAD_processed_clean_text.pkl')
  # NOTE: read_pickle unpickles arbitrary objects -- only load files produced by this project
  df = pd.read_pickle('/content/drive/MyDrive/computer_vision_final_project/MASAD_processed_clean_text.pkl')


# Display the first few entries
print(df[['text_path', 'raw_text']].head())

In [None]:
# Choose the pickle destination that matches the dataset variant in use
save_path_raw = (
    '/content/drive/MyDrive/computer_vision_final_project/MASAD_processed_raw_text_subset.pkl'
    if use_data_subset
    else '/content/drive/MyDrive/computer_vision_final_project/MASAD_processed_raw_text.pkl'
)

# Persist the DataFrame to Drive
df.to_pickle(save_path_raw)
print(f"DataFrame with raw text saved at {save_path_raw}")

DataFrame with raw text saved at /content/drive/MyDrive/computer_vision_final_project/MASAD_processed_raw_text.pkl


In [None]:
# Load the pickle file back into a new df as a sanity check on what was just written.
# NOTE: read_pickle unpickles arbitrary objects -- only load files produced by this project.
loaded_df = pd.read_pickle(save_path_raw)

# Shape and first rows of the reloaded frame
print(f"Loaded DataFrame shape: {loaded_df.shape}")
print(loaded_df.head())

Step 3: Clean raw text w/basic regex

In [None]:
def clean_text(text):
    """Normalise a raw caption into lowercase, letters-only, single-spaced text.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise ``<tag>``
    markers, links and hashtag symbols are removed, every character that is not an
    ASCII letter or whitespace is dropped, and the result is lower-cased with
    whitespace runs collapsed to one space.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r'<tag>\S*', ''),        # MASAD tag markers plus anything glued to them
        (r'http\S+', ''),         # urls/links
        (r'(#\s*)+', ' '),        # a run of hashtag symbols becomes one space
        (r'[^A-Za-z\s]', ''),     # special chars and nums
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Lowercase, squeeze whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', cleaned.lower()).strip()

# Clean every raw_text entry into a new clean_text column.
# progress_apply shows a tqdm progress bar in case this takes as long as the text extraction.
df['clean_text'] = df['raw_text'].progress_apply(clean_text)

# Display the first few entries
print(df.head())

### Embedding Extraction
Transform images, pass them into clip, extract embeddings for this step of our pre-processing

In [None]:
from transformers import CLIPImageProcessor, CLIPProcessor, CLIPImageProcessor, CLIPModel
import numpy as np
from transformers import CLIPImageProcessor, CLIPProcessor, CLIPTokenizerFast
from torchvision.transforms import v2
import torch.nn as nn
import torch.nn.functional as F
set_max_cell_output_height()
from PIL import Image, UnidentifiedImageError

def processed_im_and_text(df):
    """Load the images listed in ``df`` and pass them, with their cleaned text, to ``preprocess``.

    Args:
        df: DataFrame with ``image_path`` and ``clean_text`` columns (typically one batch of rows).

    Returns:
        tuple: ``(processed_inputs, indices_to_drop)``.
        ``processed_inputs`` is the result of ``preprocess`` for the rows whose image
        could be read, or ``None`` when no image in ``df`` could be read.
        ``indices_to_drop`` lists the 0-based *positions within ``df``* (not index
        labels) of the rows whose image was unreadable.
    """
    im_paths = df['image_path'].tolist()
    # Pull the text column once instead of doing a row lookup per image
    texts = df['clean_text'].tolist()

    im_list = []
    text_list = []
    indices_to_drop = []  # positions (within df) of corrupted/unreadable images

    for i, img_path in enumerate(tqdm(im_paths, desc="Loading Images")):
        try:
            # The context manager closes the file handle; convert() returns an in-memory copy
            with Image.open(img_path) as raw_img:
                img = raw_img.convert("RGB")
        except (UnidentifiedImageError, FileNotFoundError, OSError) as e:
            print(f"Warning: Skipping corrupted or unreadable image: {img_path} | Error: {e}")
            indices_to_drop.append(i)  # Mark this position for removal
            continue
        # Image and text are appended together so both lists stay aligned
        im_list.append(img)
        text_list.append(texts[i])

    if not im_list:
        return None, indices_to_drop
    return preprocess(text_list, im_list), indices_to_drop


def preprocess(texts, images):
    """Augment the images, then run texts and images through a CLIP processor.

    Args:
        texts: list of strings, one per image.
        images: list of images, in the same order as ``texts``.

    Returns:
        The output of the ``CLIPProcessor`` call with ``return_tensors="pt"``,
        padding and truncation enabled. ``extract_hidden_states`` reads its
        ``input_ids``, ``attention_mask`` and ``pixel_values`` entries.
    """
    # Random augmentation pipeline applied to every image before CLIP preprocessing
    augment = v2.Compose([
        v2.Resize(size=[224, 224]),
        v2.RandomResizedCrop(size=(224, 224), antialias=True),
        v2.RandomHorizontalFlip(p=0.2),
        v2.RandomRotation(90),
        v2.RandomZoomOut(p=0.3)])
    transformed_images = [augment(picture) for picture in images]

    ## start of text token is T_CLS, start of image token is just CLS
    image_processor = CLIPImageProcessor(do_rescale=True, rescale_factor=1/255, do_normalize=True)
    text_tokenizer = CLIPTokenizerFast.from_pretrained('openai/clip-vit-base-patch16', bos_token='[T_CLS]')

    # Bundle both halves so a single call handles the text and image inputs together
    clip_processor = CLIPProcessor(image_processor, text_tokenizer)
    return clip_processor(
        text=texts,
        images=transformed_images,
        truncation=True,
        return_tensors="pt",
        padding=True,
    )

<IPython.core.display.Javascript object>

### Data visualization functions to show original vs. transformed images

In [None]:
import matplotlib.pyplot as plt

def visualize_a_few_images(images, labels, display_label=False,
                           save_dir='/content/drive/MyDrive/computer_vision_final_project/'):
  """Plot up to three images next to a randomly augmented version of each.

  The figure is saved as ``imgs_and_transform.png`` inside ``save_dir`` and then shown.

  Args:
    images: indexable collection of images accepted by ``plt.imshow`` and the v2 transforms.
    labels: indexable collection aligned with ``images``; only read when ``display_label``
      is True, in which case the upper-cased first element of ``labels[i]`` is put in the title.
    display_label: whether to include the label in each original image's title.
    save_dir: directory the figure is written to (defaults to the project folder on Drive).
  """
  num_shown = min(3, len(images))

  # Same augmentation pipeline as preprocess()
  augment = v2.Compose([
    v2.Resize(size=[224, 224]),
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.2),
    v2.RandomRotation(90),
    v2.RandomZoomOut(p=0.3)])
  # Only the images that are actually displayed need to be augmented
  transformed_images = [augment(images[i]) for i in range(num_shown)]

  # plot each image and its augmented version
  plt.figure(figsize=(10, 7))
  plt.title('Sample Images and Transformations\n')
  plt.axis('off')

  for i in range(num_shown):
    # Left column (subplot 1, 3, 5): the original image
    plt.subplot(3, 2, 2*i+1)
    plt.imshow(images[i])
    plt.axis('off')  # Hide the axis labels
    if display_label:
      plt.title(f"Image {i+1} ({labels[i][0].upper()})")
    else:
      plt.title(f"Image {i+1}")

    # Right column (subplot 2, 4, 6): the augmented image
    plt.subplot(3, 2, 2*i+2)
    plt.imshow(transformed_images[i])
    plt.axis('off')  # Hide the axis labels
    plt.title(f"Image {i+1} with Transformation")

  plt.savefig(os.path.join(save_dir, 'imgs_and_transform.png'))
  plt.show()



In [None]:
def extract_hidden_states(df, clip_model, batch_size=32):
    """Embed every readable (text, image) row of ``df`` with CLIP.

    Rows are processed in consecutive batches of ``batch_size``. Each batch is loaded
    by ``processed_im_and_text``; rows whose image cannot be read are left out, and
    their labels are left out with them so embeddings and labels stay aligned.
    ``df`` itself is never modified or re-indexed, so batch boundaries do not shift
    when rows are skipped.

    Args:
        df: DataFrame with ``image_path``, ``clean_text`` and ``label`` columns.
        clip_model: model called with ``input_ids``, ``attention_mask`` and
            ``pixel_values`` whose output exposes ``text_embeds`` and ``image_embeds``.
        batch_size: number of rows sent through the model at once.

    Returns:
        tuple: ``(final_hidden_states, final_labels)``. ``final_hidden_states`` is the
        concatenation over batches of the text and image embeddings stacked along
        dim 1; ``final_labels`` is a float tensor with one entry per kept row, as
        encoded by ``process_labels``.

    Raises:
        Exception: whatever ``torch.cat`` raises if the per-batch tensors cannot be
            concatenated. The per-batch list has already been saved to
            ``stacked_clip_embeddings_list.pt`` by then, so it can be reloaded for debugging.
    """
    all_stacked_clip_embeddings = []
    labels = []
    total_samples = len(df)
    num_batches = (total_samples + batch_size - 1) // batch_size  # Ceiling division
    total_indices_dropped = 0
    total_examples = 0

    for batch_idx in tqdm(range(num_batches), desc="Extracting Hidden States"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, total_samples)
        batch_df = df.iloc[start_idx:end_idx]
        batch_inputs, indices_to_drop = processed_im_and_text(batch_df)

        if indices_to_drop:
            total_indices_dropped += len(indices_to_drop)
            print(f"Dropped {len(indices_to_drop)} corrupted images in this batch "
                  f"({total_indices_dropped} dropped so far)")
        if batch_inputs is None:
            # Every image in this batch was unreadable
            continue

        # Move only the tensors the model needs onto the compute device
        inputs = {
            key: batch_inputs[key].to(device)
            for key in ('input_ids', 'attention_mask', 'pixel_values')
        }

        ## at this point, the batch inputs are the output of the clip processor
        with torch.no_grad():
            outputs = clip_model(**inputs)

        # Use the projected embeddings (one vector per text / per image)
        text_embeddings = outputs.text_embeds
        img_embeddings = outputs.image_embeds

        # indices_to_drop holds positions within this batch, so filter the batch's own labels by position
        dropped_positions = set(indices_to_drop)
        kept_labels = [
            label
            for position, label in enumerate(batch_df["label"].to_list())
            if position not in dropped_positions
        ]
        labels.extend(process_labels(kept_labels).tolist())

        combined_hidden_states = torch.stack((text_embeddings, img_embeddings), dim=1)

        # Move to CPU to free GPU memory
        all_stacked_clip_embeddings.append(combined_hidden_states.cpu())
        total_examples += combined_hidden_states.shape[0]
        print(f"Processed {total_examples} examples so far.Labels size: {len(labels)}")

        # Optional: Clear CUDA cache to free memory
        torch.cuda.empty_cache()

    # Saved before concatenating so the embeddings survive a failure in torch.cat
    # and can be reloaded without re-running CLIP from scratch
    torch.save(all_stacked_clip_embeddings, 'stacked_clip_embeddings_list.pt')

    final_labels = torch.tensor(labels, dtype=torch.float)
    try:
        final_hidden_states = torch.cat(all_stacked_clip_embeddings, dim=0)
    except Exception as e:
        print(f'ERROR: Encounterd the folowing error when trying to concatenate all the clip embeddings; \n{e}')
        print(f'You can debug this by using torch.load("stacked_clip_embeddings_list.pt") to avoid having to re-process things w/ clip')
        # Re-raise: returning None here would only fail later when the caller unpacks the result
        raise
    return final_hidden_states, final_labels


def process_labels(labels):
    """Encode string sentiment labels as a float tensor: "negative" -> 0, "positive" -> 1.

    Any other label raises ``KeyError``.
    """
    sentiment_to_id = dict(negative=0, positive=1)  # Adjust as per your actual label names
    ids = list(map(sentiment_to_id.__getitem__, labels))
    return torch.tensor(ids, dtype=torch.float)




In [None]:
def save_model(model, save_path):
    """
    Write the model's weights (its ``state_dict``) to ``save_path`` and report the location.

    Args:
        model (nn.Module): The trained PyTorch model.
        save_path (str): Destination file for the serialized state_dict.
    """
    weights = model.state_dict()
    torch.save(weights, save_path)
    print(f"Model weights saved to {save_path}")


# Main Pre-Processing Function

In [None]:
# Load the pickled DataFrame from the project folder on Drive.
# NOTE: read_pickle unpickles arbitrary objects -- only load files produced by this project.
df = pd.read_pickle('/content/drive/MyDrive/computer_vision_final_project/MASAD_processed_clean_text.pkl')

In [None]:
# Size of the rarest label; used as the basis for sampling so both classes end up the same size
target_rows_per_class = min(df['label'].value_counts())

# Sample int(target_rows_per_class / 1.5) rows from each label with a fixed seed for reproducibility
df_small = df.groupby('label').sample(n=int(target_rows_per_class/1.5), random_state=42).reset_index(drop=True)
# Display the sizes of the original and scaled-down DataFrames
print(f"Original DataFrame size: {df.shape}")
print(f"Scaled-down DataFrame size: {df_small.shape}")
# (Optional) Verify the class distribution in the scaled-down DataFrame
print("\nClass distribution in scaled-down DataFrame:")
print(df_small['label'].value_counts())

Original DataFrame size: (38066, 5)
Scaled-down DataFrame size: (20136, 5)

Class distribution in scaled-down DataFrame:
label
negative    10068
positive    10068
Name: count, dtype: int64


### initialize cuda and clip model before pre-processing the data

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"model will run on {device}")

# Load the pretrained CLIP model onto the chosen device and switch it to eval mode
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)
clip_model.eval()

# Turn off gradient tracking on every CLIP parameter
for param in clip_model.parameters():
    param.requires_grad = False

model will run on cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

shapes into layer 1: 768, 512


Projection(
  (linear1): Linear(in_features=768, out_features=512, bias=False)
  (linear2): Linear(in_features=512, out_features=512, bias=False)
  (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (drop): Dropout(p=0.5, inplace=False)
)

In [None]:
# Re-sets the flag from the configuration cell: True runs CLIP extraction in the next cell,
# False makes that cell load the saved embeddings and labels instead.
generate_clip_embeddings =True

In [None]:
# Define the save directory on Google Drive
save_dir = '/content/drive/MyDrive/computer_vision_final_project/models/'
# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)
# Define the save paths for the embeddings and their labels
embeddings_save_path = os.path.join(save_dir, 'mme_full')
labels_save_path = os.path.join(save_dir, 'labels_full')

if generate_clip_embeddings:
  mme, label_encodings = extract_hidden_states(df_small, clip_model)  # mme is multimodal embedding

  print(f'successfully extracted the multimodal embeddings from CLIP with shape: {mme.shape}')
  print(f'label encodings shape: {label_encodings.shape}')

  # save_model() is meant for models because it saves a state_dict; the embeddings and
  # labels are plain tensors without a state_dict, so they are serialized directly
  # with torch.save()
  torch.save(mme, embeddings_save_path)
  torch.save(label_encodings, labels_save_path)
  print(f"Embeddings saved at {embeddings_save_path}")
  print(f"Labels saved at {labels_save_path}")

else:
  print(f"generate_clip_embeddings is {generate_clip_embeddings}, so we'll just read in the embeddings and labels from their saved .pt files: \n{embeddings_save_path} and \n{labels_save_path}")
  # NOTE: torch.load unpickles data -- only load files produced by this project
  mme = torch.load(embeddings_save_path)
  label_encodings = torch.load(labels_save_path)



num_batches 630
total_samples 20136


Extracting Hidden States:   0%|          | 0/630 [00:00<?, ?it/s]

end_idx 32



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  38%|███▊      | 12/32 [00:00<00:00, 117.04it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 128.98it/s]
Extracting Hidden States:   0%|          | 1/630 [00:01<10:50,  1.03s/it]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 32 examples so far.Labels size: 20136
end_idx 64



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  50%|█████     | 16/32 [00:00<00:00, 156.12it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 143.63it/s]
Extracting Hidden States:   0%|          | 2/630 [00:01<10:13,  1.02it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 64 examples so far.Labels size: 20136
end_idx 96



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  38%|███▊      | 12/32 [00:00<00:00, 112.65it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 124.47it/s]
Extracting Hidden States:   0%|          | 3/630 [00:02<10:00,  1.04it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 96 examples so far.Labels size: 20136
end_idx 128



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  50%|█████     | 16/32 [00:00<00:00, 153.16it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 141.96it/s]
Extracting Hidden States:   1%|          | 4/630 [00:03<09:54,  1.05it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 128 examples so far.Labels size: 20136
end_idx 160



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 142.09it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 128.65it/s]
Extracting Hidden States:   1%|          | 5/630 [00:04<10:22,  1.00it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 160 examples so far.Labels size: 20136
end_idx 192



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  41%|████      | 13/32 [00:00<00:00, 123.77it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 124.90it/s]
Extracting Hidden States:   1%|          | 6/630 [00:06<10:55,  1.05s/it]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 192 examples so far.Labels size: 20136
end_idx 224



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  38%|███▊      | 12/32 [00:00<00:00, 115.72it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 107.84it/s]
Extracting Hidden States:   1%|          | 7/630 [00:07<11:25,  1.10s/it]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 224 examples so far.Labels size: 20136
end_idx 256



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  41%|████      | 13/32 [00:00<00:00, 124.07it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 115.50it/s]
Extracting Hidden States:   1%|▏         | 8/630 [00:08<11:20,  1.09s/it]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 256 examples so far.Labels size: 20136
end_idx 288



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 148.30it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 137.46it/s]
Extracting Hidden States:   1%|▏         | 9/630 [00:09<10:56,  1.06s/it]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 288 examples so far.Labels size: 20136
end_idx 320



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  38%|███▊      | 12/32 [00:00<00:00, 119.32it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 130.36it/s]
Extracting Hidden States:   2%|▏         | 10/630 [00:10<10:35,  1.03s/it]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 320 examples so far.Labels size: 20136
end_idx 352



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 142.97it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 144.32it/s]
Extracting Hidden States:   2%|▏         | 11/630 [00:11<10:09,  1.02it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 352 examples so far.Labels size: 20136
end_idx 384



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 143.68it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 127.53it/s]
Extracting Hidden States:   2%|▏         | 12/630 [00:12<10:08,  1.02it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 384 examples so far.Labels size: 20136
end_idx 416



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  44%|████▍     | 14/32 [00:00<00:00, 133.35it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 135.62it/s]
Extracting Hidden States:   2%|▏         | 13/630 [00:13<09:57,  1.03it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 416 examples so far.Labels size: 20136
end_idx 448



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 139.79it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 126.79it/s]
Extracting Hidden States:   2%|▏         | 14/630 [00:14<09:56,  1.03it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 448 examples so far.Labels size: 20136
end_idx 480



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 148.42it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 140.78it/s]
Extracting Hidden States:   2%|▏         | 15/630 [00:15<09:50,  1.04it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 480 examples so far.Labels size: 20136
end_idx 512



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  44%|████▍     | 14/32 [00:00<00:00, 136.00it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 131.69it/s]
Extracting Hidden States:   3%|▎         | 16/630 [00:16<10:04,  1.02it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 512 examples so far.Labels size: 20136
end_idx 544



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  38%|███▊      | 12/32 [00:00<00:00, 119.68it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 123.83it/s]
Extracting Hidden States:   3%|▎         | 17/630 [00:17<09:59,  1.02it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 544 examples so far.Labels size: 20136
end_idx 576



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  47%|████▋     | 15/32 [00:00<00:00, 144.39it/s][A
Loading Images: 100%|██████████| 32/32 [00:00<00:00, 134.30it/s]
Extracting Hidden States:   3%|▎         | 18/630 [00:17<09:58,  1.02it/s]

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 576 examples so far.Labels size: 20136
end_idx 608



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:  31%|███▏      | 10/32 [00:00<00:00, 94.54it/s][A
Loading Images: 100%|██████████| 32/32 [00:04<00:00,  7.67it/s] 


clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 608 examples so far.Labels size: 20136


Extracting Hidden States:   3%|▎         | 19/630 [00:22<22:05,  2.17s/it]

end_idx 640



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:23,  1.34it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:26,  1.12it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.04it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.04it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.12it/s][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:31,  1.27s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:28,  1.17s/it][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:25,  1.11s/it][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:23,  1.05s/it][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:22,  1.05s/it][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:20,  1.01s/it][A
Loading Images:  41%|████      | 13/32 [00:13<00:19,  1.00s/it][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:18,  1.02s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 640 examples so far.Labels size: 20136
end_idx 672



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:31,  1.02s/it][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.05it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.01it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.02it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.03it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:26,  1.01s/it][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.02it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.02it/s][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:25,  1.10s/it][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:23,  1.06s/it][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:22,  1.05s/it][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:20,  1.04s/it][A
Loading Images:  41%|████      | 13/32 [00:13<00:18,  1.01it/s][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:18,  1.03s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 672 examples so far.Labels size: 20136
end_idx 704



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.14it/s][A
Loading Images:   6%|▋         | 2/32 [00:02<00:44,  1.49s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:35,  1.23s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:30,  1.10s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:27,  1.01s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:29,  1.14s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:27,  1.11s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:25,  1.05s/it][A
Loading Images:  28%|██▊       | 9/32 [00:11<00:33,  1.46s/it][A
Loading Images:  31%|███▏      | 10/32 [00:12<00:28,  1.30s/it][A
Loading Images:  34%|███▍      | 11/32 [00:13<00:25,  1.19s/it][A
Loading Images:  38%|███▊      | 12/32 [00:14<00:23,  1.16s/it][A
Loading Images:  41%|████      | 13/32 [00:15<00:20,  1.10s/it][A
Loading Images:  44%|████▍     | 14/32 [00:16<00:19,  1.07s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 704 examples so far.Labels size: 20136
end_idx 736



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:33,  1.08s/it][A
Loading Images:   6%|▋         | 2/32 [00:03<00:48,  1.61s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:35,  1.21s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:32,  1.15s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:28,  1.04s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:25,  1.00it/s][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:25,  1.01s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:24,  1.02s/it][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:22,  1.01it/s][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:20,  1.06it/s][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:20,  1.03it/s][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:19,  1.02it/s][A
Loading Images:  41%|████      | 13/32 [00:13<00:18,  1.02it/s][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:17,  1.04it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 736 examples so far.Labels size: 20136


Extracting Hidden States:   4%|▎         | 23/630 [02:38<4:23:01, 26.00s/it]

end_idx 768



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.10it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.20it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.08it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.08it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:28,  1.04s/it][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:26,  1.01s/it][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.01it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:24,  1.00s/it][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.03it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.03it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.05it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.04it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 768 examples so far.Labels size: 20136
end_idx 800



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.07it/s][A
Loading Images:   6%|▋         | 2/32 [00:02<00:30,  1.02s/it][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.03it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.12it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.10it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.04it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.03it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.04it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.04it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.01it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.04it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.07it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 800 examples so far.Labels size: 20136
end_idx 832



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.11it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.05it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.04it/s][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:34,  1.28s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:33,  1.29s/it][A
Loading Images:  22%|██▏       | 7/32 [00:08<00:31,  1.26s/it][A
Loading Images:  25%|██▌       | 8/32 [00:09<00:28,  1.18s/it][A
Loading Images:  28%|██▊       | 9/32 [00:10<00:25,  1.10s/it][A
Loading Images:  31%|███▏      | 10/32 [00:11<00:23,  1.07s/it][A
Loading Images:  34%|███▍      | 11/32 [00:12<00:21,  1.04s/it][A
Loading Images:  38%|███▊      | 12/32 [00:13<00:20,  1.02s/it][A
Loading Images:  41%|████      | 13/32 [00:14<00:20,  1.06s/it][A
Loading Images:  44%|████▍     | 14/32 [00:15<00:18,  1.02s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 832 examples so far.Labels size: 20136
end_idx 864



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.17it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:29,  1.02it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.01it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.03it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.03it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.04it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.03it/s][A




Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:26,  1.22s/it][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:23,  1.12s/it][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:21,  1.07s/it][A
Loading Images:  41%|████      | 13/32 [00:13<00:19,  1.02s/it][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:18,  1.00s/it][A
Loading Images:  47%|████▋     | 15/32 [00:15<00:16,  1.00it/s][A
Loading Images:  50%|█████     | 16/32 [00:15<00:15,  1.04it/s][A
Loading Images:  53%|█████▎    | 17/32 [00:16<00:14,  1.04it/s][A
Loading Images:  56%|█████▋    | 18/32 [00:17<00:13,  1.06it/s][A
Loading Images:  59%|█████▉    | 19/32 [00:18<00:12,  1.06it/s][A
Loading Images:  62%|██████▎   | 20/32 [00:19<00:11,  1.05it/s][A
Loading Images:  66%|██████▌   | 21/32 [00:20<00:10,  1.05it/s][A
Loading Images:  69%|██████▉   | 22/32 [00:21<00:08,  1.16it/s]

Dropped 1 corrupted images. New DataFrame size: (20135, 5)
clip_error False
text embeddings have shape: torch.Size([31, 512]) and img embeddings have shape: torch.Size([31, 512])
combined hidden states now has shape: torch.Size([31, 2, 512])


Extracting Hidden States:   4%|▍         | 27/630 [04:45<5:05:17, 30.38s/it]

Processed 863 examples so far.Labels size: 20135
end_idx 896



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.16it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.08it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.08it/s][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:37,  1.37s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:32,  1.26s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:26,  1.07s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:24,  1.01s/it][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:22,  1.01it/s][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:20,  1.07it/s][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:21,  1.05s/it][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:20,  1.04s/it][A
Loading Images:  41%|████      | 13/32 [00:13<00:19,  1.01s/it][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:17,  1.03it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 895 examples so far.Labels size: 20135
end_idx 928



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.10it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.04it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.12it/s][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:31,  1.17s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:28,  1.09s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:26,  1.05s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:24,  1.00s/it][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.04it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.02it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.11it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 927 examples so far.Labels size: 20135
end_idx 960



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.05it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.20it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.11it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:23,  1.17it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.17it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.10it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.13it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.12it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:20,  1.12it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.06it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:21,  1.03s/it][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:20,  1.00s/it][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.04it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.05it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 959 examples so far.Labels size: 20135


Extracting Hidden States:   5%|▍         | 30/630 [06:22<5:14:06, 31.41s/it]

end_idx 992



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.07it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.08it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.08it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.09it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.13it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:21,  1.19it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:21,  1.19it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.13it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:20,  1.15it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:20,  1.09it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.07it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 991 examples so far.Labels size: 20135
end_idx 1024



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.19it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:26,  1.11it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.11it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.04it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.08it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.11it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.09it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.01it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.04it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.06it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.02it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.04it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1023 examples so far.Labels size: 20135
end_idx 1056



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.08it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:03<00:31,  1.07s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:29,  1.04s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:26,  1.00it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.01it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.10it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:20,  1.19it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:19,  1.18it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:19,  1.15it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.14it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.09it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.03it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.02it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1055 examples so far.Labels size: 20135
end_idx 1088



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.10it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.04it/s][A
Loading Images:   9%|▉         | 3/32 [00:03<00:30,  1.04s/it][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.01it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.03it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.04it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.04it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:19,  1.16it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.09it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1087 examples so far.Labels size: 20135
end_idx 1120



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:39,  1.26s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:32,  1.07s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:29,  1.03s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:28,  1.04s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:27,  1.00s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:25,  1.01it/s][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:24,  1.02it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:20,  1.10it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:18,  1.17it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.14it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:17,  1.15it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.11it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1119 examples so far.Labels size: 20135
end_idx 1152



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.14it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:22,  1.32it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.19it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.13it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.02it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.10it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:19,  1.10it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.04it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.07it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1151 examples so far.Labels size: 20135
end_idx 1184



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.02it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.05it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.12it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.10it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.03it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.12it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:19,  1.12it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.11it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.13it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1183 examples so far.Labels size: 20135
end_idx 1216



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.19it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.08it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.13it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.03it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.04it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.02it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.03it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.08it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.12it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:17,  1.11it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.11it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.12it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1215 examples so far.Labels size: 20135
end_idx 1248



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.06it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.06it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.12it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.08it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.11it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.12it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.09it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.04it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:25,  1.21s/it][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:22,  1.10s/it][A
Loading Images:  41%|████      | 13/32 [00:12<00:19,  1.03s/it][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.02it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1247 examples so far.Labels size: 20135
end_idx 1280



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:33,  1.08s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:32,  1.08s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:29,  1.02s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:28,  1.02s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:26,  1.02it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.09it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.07it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:17,  1.12it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1279 examples so far.Labels size: 20135
end_idx 1312



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.13it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.09it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.06it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.01it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.01it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.03it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.06it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.07it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1311 examples so far.Labels size: 20135
end_idx 1344



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.07it/s][A
Loading Images:   6%|▋         | 2/32 [00:02<00:32,  1.08s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:30,  1.04s/it][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.02it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.03it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.06it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.06it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.08it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1343 examples so far.Labels size: 20135
end_idx 1376



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.03it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.06it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.11it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.04it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.03it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.01it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.03it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.03it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.09it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:15,  1.18it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1375 examples so far.Labels size: 20135
end_idx 1408



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:21,  1.43it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.23it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.17it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:22,  1.27it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.17it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.14it/s][A
Loading Images:  22%|██▏       | 7/32 [00:05<00:20,  1.20it/s][A
Loading Images:  25%|██▌       | 8/32 [00:06<00:20,  1.16it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:20,  1.12it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:20,  1.10it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:19,  1.10it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.09it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.09it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1407 examples so far.Labels size: 20135


Extracting Hidden States:   7%|▋         | 44/630 [13:38<5:00:56, 30.81s/it]

end_idx 1440



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:44,  1.42s/it][A
Loading Images:   6%|▋         | 2/32 [00:03<00:58,  1.94s/it][A
Loading Images:   9%|▉         | 3/32 [00:04<00:41,  1.43s/it][A
Loading Images:  12%|█▎        | 4/32 [00:06<00:42,  1.52s/it][A
Loading Images:  16%|█▌        | 5/32 [00:07<00:34,  1.27s/it][A
Loading Images:  19%|█▉        | 6/32 [00:07<00:29,  1.14s/it][A
Loading Images:  22%|██▏       | 7/32 [00:08<00:26,  1.05s/it][A
Loading Images:  25%|██▌       | 8/32 [00:09<00:25,  1.04s/it][A
Loading Images:  28%|██▊       | 9/32 [00:10<00:23,  1.02s/it][A
Loading Images:  31%|███▏      | 10/32 [00:11<00:21,  1.01it/s][A
Loading Images:  34%|███▍      | 11/32 [00:12<00:20,  1.02it/s][A
Loading Images:  38%|███▊      | 12/32 [00:14<00:22,  1.14s/it][A
Loading Images:  41%|████      | 13/32 [00:14<00:19,  1.04s/it][A
Loading Images:  44%|████▍     | 14/32 [00:16<00:19,  1.10s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1439 examples so far.Labels size: 20135
end_idx 1472



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:39,  1.27s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:30,  1.03s/it][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.06it/s][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:27,  1.01it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.04it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.09it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.07it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.03it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.08it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.07it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.11it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:13<00:24,  1.29s/it][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:21,  1.18s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1471 examples so far.Labels size: 20135
end_idx 1504



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.08it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.05it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.03it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.03it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.08it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.09it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.02it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.04it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.01it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.02it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.07it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1503 examples so far.Labels size: 20135
end_idx 1536



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.17it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.19it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.08it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.07it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.11it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.06it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.06it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.06it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.10it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.05it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:19,  1.05s/it][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:18,  1.05s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1535 examples so far.Labels size: 20135


Extracting Hidden States:   8%|▊         | 48/630 [15:45<5:04:30, 31.39s/it]

end_idx 1568



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:22,  1.36it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.23it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:23,  1.22it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.13it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.03it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.03it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.07it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.06it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1567 examples so far.Labels size: 20135
end_idx 1600



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.02it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.05it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.03it/s][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:28,  1.06s/it][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:26,  1.02s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:27,  1.10s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:24,  1.01s/it][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.02it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.07it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.11it/s][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:20,  1.04s/it][A
Loading Images:  41%|████      | 13/32 [00:12<00:19,  1.02s/it][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:21,  1.22s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1599 examples so far.Labels size: 20135
end_idx 1632



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.02it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.11it/s][A
Loading Images:   9%|▉         | 3/32 [00:05<01:00,  2.07s/it][A
Loading Images:  12%|█▎        | 4/32 [00:06<00:45,  1.61s/it][A
Loading Images:  16%|█▌        | 5/32 [00:07<00:36,  1.36s/it][A
Loading Images:  19%|█▉        | 6/32 [00:08<00:32,  1.26s/it][A
Loading Images:  22%|██▏       | 7/32 [00:09<00:28,  1.12s/it][A
Loading Images:  25%|██▌       | 8/32 [00:09<00:25,  1.06s/it][A
Loading Images:  28%|██▊       | 9/32 [00:10<00:22,  1.00it/s][A
Loading Images:  31%|███▏      | 10/32 [00:12<00:26,  1.19s/it][A
Loading Images:  34%|███▍      | 11/32 [00:13<00:23,  1.12s/it][A
Loading Images:  38%|███▊      | 12/32 [00:14<00:21,  1.06s/it][A
Loading Images:  41%|████      | 13/32 [00:15<00:19,  1.03s/it][A
Loading Images:  44%|████▍     | 14/32 [00:17<00:23,  1.32s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1631 examples so far.Labels size: 20135
end_idx 1664



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.11it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:29,  1.03it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.01it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.03it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.01it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.04it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.02it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.00it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.02it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.08it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1663 examples so far.Labels size: 20135
end_idx 1696



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.05it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:26,  1.14it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.13it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.16it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.06it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.13it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.11it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:18,  1.22it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:18,  1.20it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.16it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.13it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1695 examples so far.Labels size: 20135
end_idx 1728



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.07it/s][A
Loading Images:   6%|▋         | 2/32 [00:02<00:31,  1.05s/it][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.01it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.05it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.12it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.01it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.02it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.02it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.05it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.04it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.04it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1727 examples so far.Labels size: 20135
end_idx 1760



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:34,  1.13s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:31,  1.04s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:29,  1.00s/it][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.03it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.05it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.05it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.08it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:19,  1.10it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.10it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:17,  1.12it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.08it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1759 examples so far.Labels size: 20135
end_idx 1792



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.09it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:29,  1.02it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.10it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.13it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:27,  1.01s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:32,  1.23s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:28,  1.15s/it][A




Loading Images:  25%|██▌       | 8/32 [00:08<00:26,  1.09s/it][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:23,  1.04s/it][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:20,  1.07it/s][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:20,  1.04it/s][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:18,  1.06it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.10it/s][A
Loading Images:  47%|████▋     | 15/32 [00:14<00:15,  1.13it/s][A
Loading Images:  50%|█████     | 16/32 [00:15<00:14,  1.10it/s][A
Loading Images:  53%|█████▎    | 17/32 [00:17<00:16,  1.11s/it][A
Loading Images:  56%|█████▋    | 18/32 [00:18<00:14,  1.04s/it][A
Loading Images:  59%|█████▉    | 19/32 [00:18<00:12,  1.07it/s][A
Loading Images:  62%|██████▎   | 20/32 [00:19<00:12,  1.02s/it][A
Loading Images:  66%|██████▌   | 21/32 [00:21<00:13,  1.24s/it][A
Loading Images:  69%|██████▉   | 22/32 [00:22<00:11,  1.17s/it]

Dropped 1 corrupted images. New DataFrame size: (20134, 5)
clip_error False
text embeddings have shape: torch.Size([31, 512]) and img embeddings have shape: torch.Size([31, 512])
combined hidden states now has shape: torch.Size([31, 2, 512])


Extracting Hidden States:   9%|▉         | 56/630 [20:03<5:10:27, 32.45s/it]

Processed 1790 examples so far.Labels size: 20134
end_idx 1824



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:32,  1.03s/it][A
Loading Images:   6%|▋         | 2/32 [00:03<00:50,  1.67s/it][A
Loading Images:   9%|▉         | 3/32 [00:04<00:39,  1.36s/it][A
Loading Images:  12%|█▎        | 4/32 [00:05<00:35,  1.28s/it][A
Loading Images:  16%|█▌        | 5/32 [00:06<00:31,  1.17s/it][A
Loading Images:  19%|█▉        | 6/32 [00:07<00:27,  1.07s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:23,  1.04it/s][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:22,  1.06it/s][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:22,  1.03it/s][A
Loading Images:  31%|███▏      | 10/32 [00:10<00:21,  1.01it/s][A
Loading Images:  34%|███▍      | 11/32 [00:11<00:20,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:12<00:18,  1.05it/s][A
Loading Images:  41%|████      | 13/32 [00:13<00:17,  1.08it/s][A
Loading Images:  44%|████▍     | 14/32 [00:14<00:16,  1.12it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1822 examples so far.Labels size: 20134


Extracting Hidden States:   9%|▉         | 57/630 [20:34<5:06:21, 32.08s/it]

end_idx 1856



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.05it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.05it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.04it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.06it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.05it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.06it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.08it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.08it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.05it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1854 examples so far.Labels size: 20134
end_idx 1888



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:31,  1.02s/it][A
Loading Images:   6%|▋         | 2/32 [00:01<00:29,  1.03it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.03it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.05it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.08it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.07it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.11it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:20,  1.15it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:19,  1.13it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.15it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.16it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:16,  1.18it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:15,  1.16it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1886 examples so far.Labels size: 20134
end_idx 1920



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.03it/s][A
Loading Images:   6%|▋         | 2/32 [00:02<00:30,  1.03s/it][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.02it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.04it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.10it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.05it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:19,  1.15it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.12it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:16,  1.12it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.10it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1918 examples so far.Labels size: 20134


Extracting Hidden States:  10%|▉         | 60/630 [22:04<4:52:19, 30.77s/it]

end_idx 1952



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.03it/s][A
Loading Images:   6%|▋         | 2/32 [00:02<00:36,  1.20s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:27,  1.04it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.04it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.05it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.08it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.06it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.06it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:18,  1.18it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.11it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:17,  1.12it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:16,  1.14it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:15,  1.15it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1950 examples so far.Labels size: 20134
end_idx 1984



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.12it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.06it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:23,  1.20it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.14it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.12it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.11it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.08it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.12it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.06it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.09it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.11it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 1982 examples so far.Labels size: 20134


Extracting Hidden States:  10%|▉         | 62/630 [23:06<4:50:44, 30.71s/it]

end_idx 2016



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.06it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.06it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.12it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:22,  1.19it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.02it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.04it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.02it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.04it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.00it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.06it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.09it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.11it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2014 examples so far.Labels size: 20134
end_idx 2048



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:39,  1.26s/it][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.17it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:23,  1.18it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:22,  1.19it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.16it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.12it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.06it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.06it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.05it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2046 examples so far.Labels size: 20134
end_idx 2080



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.04it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:26,  1.13it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.10it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.12it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.11it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:26,  1.07s/it][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:24,  1.03s/it][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.02it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.02it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.03it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.05it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.02it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2078 examples so far.Labels size: 20134


Extracting Hidden States:  10%|█         | 65/630 [24:39<4:52:17, 31.04s/it]

end_idx 2112



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:22,  1.37it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.21it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.09it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:23,  1.17it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.14it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.16it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.12it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.12it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.06it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:19,  1.11it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.13it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.13it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.11it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:15,  1.15it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2110 examples so far.Labels size: 20134
end_idx 2144



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.03it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:26,  1.13it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.19it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:23,  1.20it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.10it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.11it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:20,  1.13it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.02it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.01it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.04it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.02it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.02it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2142 examples so far.Labels size: 20134
end_idx 2176



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.18it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.15it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.11it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.07it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.10it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.11it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.03it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.05it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.07it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.11it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2174 examples so far.Labels size: 20134
end_idx 2208



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:20,  1.48it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.23it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.14it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.15it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.16it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.17it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.09it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.08it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:24,  1.11s/it][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:23,  1.12s/it][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:21,  1.06s/it][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.00it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.06it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2206 examples so far.Labels size: 20134
end_idx 2240



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:31,  1.01s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:31,  1.04s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:30,  1.05s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:28,  1.01s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:31,  1.16s/it][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:28,  1.08s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:25,  1.03s/it][A
Loading Images:  25%|██▌       | 8/32 [00:08<00:28,  1.20s/it][A
Loading Images:  28%|██▊       | 9/32 [00:09<00:25,  1.12s/it][A
Loading Images:  31%|███▏      | 10/32 [00:11<00:25,  1.15s/it][A
Loading Images:  34%|███▍      | 11/32 [00:12<00:22,  1.08s/it][A
Loading Images:  38%|███▊      | 12/32 [00:13<00:21,  1.07s/it][A
Loading Images:  41%|████      | 13/32 [00:14<00:23,  1.24s/it][A
Loading Images:  44%|████▍     | 14/32 [00:15<00:20,  1.16s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2238 examples so far.Labels size: 20134
end_idx 2272



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.13it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.07it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.18it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.13it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:20,  1.15it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:20,  1.13it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.11it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.12it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:15,  1.14it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2270 examples so far.Labels size: 20134
end_idx 2304



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.11it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.04it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.17it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.09it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.06it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.06it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.07it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.02it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.04it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.08it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.06it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2302 examples so far.Labels size: 20134
end_idx 2336



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.09it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.06it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.05it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.05it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.07it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.10it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:20,  1.11it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.04it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.12it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.09it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2334 examples so far.Labels size: 20134
end_idx 2368



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.06it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.11it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.10it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.06it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.10it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.10it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.08it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2366 examples so far.Labels size: 20134
end_idx 2400



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:31,  1.01s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:30,  1.01s/it][A
Loading Images:   9%|▉         | 3/32 [00:02<00:28,  1.02it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:27,  1.02it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.04it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.09it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.10it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.03it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2398 examples so far.Labels size: 20134
end_idx 2432



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:32,  1.04s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:35,  1.17s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:30,  1.06s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:28,  1.01s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:26,  1.02it/s][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:25,  1.02it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.04it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.04it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.08it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.08it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2430 examples so far.Labels size: 20134
end_idx 2464



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.15it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.16it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.12it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.10it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:28,  1.04s/it][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:26,  1.00s/it][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.04it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.06it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.11it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2462 examples so far.Labels size: 20134


Extracting Hidden States:  12%|█▏        | 77/630 [30:49<4:42:30, 30.65s/it]

end_idx 2496



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:27,  1.13it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.17it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.19it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.06it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:26,  1.03it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.02it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.06it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.06it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.06it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.04it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.06it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.02it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.04it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.05it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2494 examples so far.Labels size: 20134


Extracting Hidden States:  12%|█▏        | 78/630 [31:20<4:43:58, 30.87s/it]

end_idx 2528



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.07it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.07it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.10it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.08it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.11it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.09it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.07it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:18,  1.14it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.15it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.12it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:15,  1.17it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2526 examples so far.Labels size: 20134
end_idx 2560



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.10it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.10it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.09it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.08it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.04it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:26,  1.01s/it][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.07it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.10it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.07it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.01it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.03it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.03it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2558 examples so far.Labels size: 20134
end_idx 2592



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:26,  1.15it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.16it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.17it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.10it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.08it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.08it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.08it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.07it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.09it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.12it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.10it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.08it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2590 examples so far.Labels size: 20134


Extracting Hidden States:  13%|█▎        | 81/630 [33:36<6:41:35, 43.89s/it]

end_idx 2624



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:31,  1.01s/it][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.04it/s][A
Loading Images:   9%|▉         | 3/32 [00:03<00:29,  1.03s/it][A
Loading Images:  12%|█▎        | 4/32 [00:04<00:28,  1.03s/it][A
Loading Images:  16%|█▌        | 5/32 [00:05<00:27,  1.02s/it][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:25,  1.02it/s][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:25,  1.00s/it][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:23,  1.02it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.04it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:19,  1.05it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.07it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.06it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2622 examples so far.Labels size: 20134
end_idx 2656



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:23,  1.31it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:25,  1.16it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.13it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.14it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.11it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.10it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.09it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.08it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.06it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.08it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:16,  1.16it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:14,  1.22it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2654 examples so far.Labels size: 20134
end_idx 2688



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:19,  1.55it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.24it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.14it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.16it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.13it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.12it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.11it/s][A
Loading Images:  25%|██▌       | 8/32 [00:06<00:21,  1.13it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:20,  1.11it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.01it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.03it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.11it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.08it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.07it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2686 examples so far.Labels size: 20134
end_idx 2720



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:32,  1.05s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:29,  1.01it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:24,  1.17it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:24,  1.14it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.11it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.10it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.05it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:18,  1.19it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:17,  1.17it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:18,  1.04it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:17,  1.05it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2718 examples so far.Labels size: 20134
end_idx 2752



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.08it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.07it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:25,  1.14it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.10it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:23,  1.14it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.16it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:21,  1.14it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.09it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.09it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:20,  1.08it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:19,  1.10it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.12it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:16,  1.14it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.07it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2750 examples so far.Labels size: 20134
end_idx 2784



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:28,  1.08it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:28,  1.06it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.05it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.07it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:22,  1.15it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:22,  1.14it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:21,  1.11it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:20,  1.12it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:18,  1.17it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:19,  1.07it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:18,  1.10it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.09it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.10it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2782 examples so far.Labels size: 20134
end_idx 2816



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:29,  1.05it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.11it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.11it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.09it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.09it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.04it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:24,  1.04it/s][A
Loading Images:  25%|██▌       | 8/32 [00:09<00:40,  1.71s/it][A
Loading Images:  28%|██▊       | 9/32 [00:10<00:33,  1.46s/it][A
Loading Images:  31%|███▏      | 10/32 [00:11<00:28,  1.29s/it][A
Loading Images:  34%|███▍      | 11/32 [00:12<00:24,  1.19s/it][A
Loading Images:  38%|███▊      | 12/32 [00:13<00:22,  1.11s/it][A
Loading Images:  41%|████      | 13/32 [00:14<00:20,  1.06s/it][A
Loading Images:  44%|████▍     | 14/32 [00:15<00:19,  1.08s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2814 examples so far.Labels size: 20134
end_idx 2848



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:32,  1.04s/it][A
Loading Images:   6%|▋         | 2/32 [00:01<00:27,  1.10it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.08it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.04it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:24,  1.12it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.07it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:25,  1.02s/it][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:22,  1.05it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:21,  1.05it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.02it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.04it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.07it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:18,  1.03it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:17,  1.03it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2846 examples so far.Labels size: 20134
end_idx 2880



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:01<00:33,  1.07s/it][A
Loading Images:   6%|▋         | 2/32 [00:02<00:32,  1.08s/it][A
Loading Images:   9%|▉         | 3/32 [00:03<00:29,  1.01s/it][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.07it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.08it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:23,  1.10it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:20,  1.21it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:20,  1.18it/s][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:19,  1.15it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:19,  1.12it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.13it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.15it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:17,  1.11it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.10it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2878 examples so far.Labels size: 20134
end_idx 2912



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:20,  1.50it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:24,  1.23it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:22,  1.28it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:22,  1.25it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:22,  1.20it/s][A
Loading Images:  19%|█▉        | 6/32 [00:04<00:22,  1.18it/s][A
Loading Images:  22%|██▏       | 7/32 [00:05<00:21,  1.16it/s][A
Loading Images:  25%|██▌       | 8/32 [00:06<00:21,  1.14it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:20,  1.11it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:18,  1.20it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:17,  1.17it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.14it/s][A
Loading Images:  41%|████      | 13/32 [00:11<00:16,  1.12it/s][A
Loading Images:  44%|████▍     | 14/32 [00:12<00:16,  1.09it/s][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2910 examples so far.Labels size: 20134
end_idx 2944



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:25,  1.23it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:26,  1.11it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:26,  1.09it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:25,  1.10it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.06it/s][A
Loading Images:  19%|█▉        | 6/32 [00:05<00:24,  1.08it/s][A
Loading Images:  22%|██▏       | 7/32 [00:06<00:23,  1.08it/s][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:19,  1.21it/s][A
Loading Images:  28%|██▊       | 9/32 [00:07<00:18,  1.24it/s][A
Loading Images:  31%|███▏      | 10/32 [00:08<00:18,  1.17it/s][A
Loading Images:  34%|███▍      | 11/32 [00:09<00:18,  1.15it/s][A
Loading Images:  38%|███▊      | 12/32 [00:10<00:17,  1.16it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:21,  1.15s/it][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:19,  1.08s/it][A
Loading Imag

clip_error True
text embeddings have shape: torch.Size([32, 512]) and img embeddings have shape: torch.Size([32, 512])
combined hidden states now has shape: torch.Size([32, 2, 512])
Processed 2942 examples so far.Labels size: 20134
end_idx 2976



Loading Images:   0%|          | 0/32 [00:00<?, ?it/s][A
Loading Images:   3%|▎         | 1/32 [00:00<00:30,  1.02it/s][A
Loading Images:   6%|▋         | 2/32 [00:01<00:29,  1.03it/s][A
Loading Images:   9%|▉         | 3/32 [00:02<00:27,  1.04it/s][A
Loading Images:  12%|█▎        | 4/32 [00:03<00:26,  1.07it/s][A
Loading Images:  16%|█▌        | 5/32 [00:04<00:25,  1.07it/s][A
Loading Images:  19%|█▉        | 6/32 [00:06<00:28,  1.10s/it][A
Loading Images:  22%|██▏       | 7/32 [00:07<00:26,  1.05s/it][A
Loading Images:  25%|██▌       | 8/32 [00:07<00:24,  1.00s/it][A
Loading Images:  28%|██▊       | 9/32 [00:08<00:22,  1.04it/s][A
Loading Images:  31%|███▏      | 10/32 [00:09<00:21,  1.03it/s][A
Loading Images:  34%|███▍      | 11/32 [00:10<00:20,  1.05it/s][A
Loading Images:  38%|███▊      | 12/32 [00:11<00:18,  1.06it/s][A
Loading Images:  41%|████      | 13/32 [00:12<00:17,  1.06it/s][A
Loading Images:  44%|████▍     | 14/32 [00:13<00:16,  1.08it/s][A
Loading Imag

## Initialize Dataloader

In [None]:
def process_labels(labels):
  """ Encode string sentiment labels as a float tensor.

  Args:
    labels: iterable of strings, each either "negative" or "positive".

  Returns:
    1-D torch.float tensor holding 0.0 for "negative" and 1.0 for "positive",
    in the same order as the input.

  Raises:
    KeyError: if a label is neither "negative" nor "positive".
  """
  sentiment_to_id = {"negative": 0, "positive": 1}
  ids = []
  for sentiment in labels:
    ids.append(sentiment_to_id[sentiment])
  return torch.tensor(ids, dtype=torch.float)

In [None]:
def extract_cls_emeddings(encoder_emeddings):
  """ Slice out the two per-sample vectors at sequence positions 0 and 77.

  Args:
    encoder_emeddings: 3-D array/tensor indexed as [sample, position, feature].
      NOTE(review): position 0 is treated as the text CLS slot and position 77
      as the image CLS slot -- presumably text occupies positions 0..76; confirm
      against the code that builds the sequence.

  Returns:
    Tuple (text_cls, image_cls), each indexed as [sample, feature].
  """
  return encoder_emeddings[:, 0, :], encoder_emeddings[:, 77, :]

In [None]:
from torch.utils.data import random_split, TensorDataset, DataLoader, Dataset

class MultiModalDataset(Dataset):
  """ Map-style dataset pairing precomputed embeddings with sentiment labels.

  Sample ``i`` is the tuple ``(embeddings[i], sentiment_labels[i])``. Both
  containers are stored as given and indexed directly.
  """

  def __init__(self, embeddings, sentiment_labels):
    """
    Args:
      embeddings: indexable container of per-sample embeddings.
      sentiment_labels: indexable container of per-sample labels; its length
        defines the length of the dataset.
    """
    self.embeddings = embeddings
    self.labels = sentiment_labels


  def __len__(self):
    """ Returns the number of samples in the dataset.
    """
    return int(len(self.labels))

  def __getitem__(self, idx):
    """ Returns ``(embedding, label)`` for ``idx``.

    If the lookup fails, the failure is reported and one randomly chosen
    sample is returned instead, so a single bad index does not abort a run.

    Raises:
      IndexError: if the lookup fails and there is no sample to fall back to.
    """
    try:
      return self.embeddings[idx], self.labels[idx]
    except Exception as err:
      # Deliberate best-effort fallback: keep going with a substitute sample.
      # Only indices valid for BOTH containers are candidates, so the retry
      # cannot fail for the same reason and needs no recursion.
      n_valid = min(len(self.embeddings), len(self.labels))
      if n_valid == 0:
        raise IndexError(f"Can't get sample {idx}: dataset is empty") from err
      print(f"Can't get sample {idx} ({type(err).__name__}: {err}); using a random sample instead")
      random_idx = torch.randint(0, n_valid, (1,)).item()
      return self.embeddings[random_idx], self.labels[random_idx]

 \\

\\

\\

\\

\\

# Start Here

Load the data from the saved embedding and label files.

In [None]:
# Toggle read by the path-selection cell below: True selects the *_1k.pt files,
# False selects the *_20k.pt files.
subset_test = False

In [None]:
# Choose which saved embedding/label files to load, based on `subset_test`.
if not subset_test:
  embeddings_path = '/content/drive/MyDrive/MASAD_pivot/mme_20k.pt'
  labels_path = '/content/drive/MyDrive/MASAD_pivot/labels_20k.pt'
  print("Running on full, balanced data for final results.")
else:
  embeddings_path = '/content/drive/MyDrive/MASAD_pivot/mme_1k.pt'
  labels_path = '/content/drive/MyDrive/MASAD_pivot/labels_1k.pt'
  print("Running on subset for testing.")

Running on full, balanced data for final results.


Torch.load: https://pytorch.org/docs/stable/generated/torch.load.html \\
random_split: https://discuss.pytorch.org/t/how-to-split-dataset-into-test-and-validation-sets/33987/3

In [None]:
# Deserialize the saved embeddings and labels from the paths selected above.
# NOTE(review): torch.load unpickles the file, so only load files you trust;
# consider passing weights_only=True -- confirm it works for the labels file.
embeddings = torch.load(embeddings_path)
labels = torch.load(labels_path)

# Sanity check: print the embeddings' size and the number of labels.
print(embeddings.size())
print(len(labels))

  embeddings = torch.load(embeddings_path)
  labels = torch.load(labels_path)


torch.Size([19982, 2, 512])
19982


Init dataset object for dataloader and get seeded random split

In [None]:
# Seeded generator so the random train/test split is reproducible across runs
generator = torch.Generator().manual_seed(42)

# Wrap the loaded embeddings and labels in the dataset class used by the dataloaders
dataset = MultiModalDataset(embeddings, labels)

# Fraction of the samples assigned to the training set
split = 0.8

# Train gets the truncated share; test gets the remainder so the sizes sum to len(dataset)
train_split = int(split * len(dataset))
test_split = int(len(dataset) - train_split)

# Debugging size issues
if len(dataset) == (train_split + test_split):
  print("Successful split")
else:
  print(f"Split failed: train len = {train_split}, test len = {test_split}")

# Randomly partition the dataset into train and test subsets
train_set, test_set = torch.utils.data.random_split(dataset, [train_split, test_split], generator=generator)

Successful split


In [None]:
# Number of samples per batch for both loaders
batch_size = 32

# Training batches are reshuffled every epoch; test batches keep a fixed order
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

## Data Visualization

In [None]:
# Show the first few raw images with their labels.
# Relies on `save_path_raw`, `Image` and `visualize_a_few_images` being defined elsewhere in the notebook.
import pandas as pd
from PIL import ImageColor  # NOTE(review): not used in this cell
df = pd.read_pickle(save_path_raw)
print(df.keys())
image_paths = df['image_path']
# NOTE(review): this rebinds the global `labels`, which earlier held the object
# loaded from `labels_path` -- re-run the load cell before reusing it.
labels = df['label']
images = []

nvis = 4 # how many images to visualize
for i in range(nvis):
  # NOTE(review): opened images are never explicitly closed
  im = Image.open(image_paths[i])
  images.append(im)

print(f'attempting to visualize {len(images)} with matplotlib')
visualize_a_few_images(images, labels[:nvis])

# Model Architecture

In [None]:
class SequenceWiseTransformer(nn.Module):
    """ Stack of `num_layers` identical transformer encoder layers (batch-first).

    Args:
        d_model: feature size of each sequence element.
        nhead: number of attention heads per layer.
        num_layers: number of stacked encoder layers.
        dim_feedforward: hidden size of each layer's feed-forward sublayer.
        dropout: dropout probability used inside each layer.
    """

    def __init__(self, d_model=512, nhead=8, num_layers=4, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        layer_config = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config),
            num_layers=num_layers,
        )

    def forward(self, x):
        """ Run `x` (batch first, last dim = d_model) through the encoder stack. """
        return self.transformer_encoder(x)

In [None]:
class KeylessAttention(nn.Module):
    """ Fuse a text and an image embedding with learned softmax weights.

    Each embedding is scored by its dot product with one learned vector `W`;
    the two scores are softmax-normalized and used to form a weighted sum.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.W = nn.Parameter(torch.empty(hidden_size))
        # Xavier init needs >= 2 dims, so initialize in place through a (1, hidden_size) view
        nn.init.xavier_uniform_(self.W.unsqueeze(0))

    def forward(self, text_emb, im_emb):
        """ Return the attention-weighted sum of `text_emb` and `im_emb`.

        Assumes both inputs are (batch, hidden_size) -- TODO confirm for other callers.
        """
        # One scalar score per sample and modality, laid out as [text, image] along dim 1
        scores = torch.stack(
            [torch.matmul(text_emb, self.W), torch.matmul(im_emb, self.W)],
            dim=1,
        )

        # Normalize across the two modalities so the weights sum to 1 per sample
        weights = F.softmax(scores, dim=1)
        text_weight = weights[:, 0].unsqueeze(1)
        image_weight = weights[:, 1].unsqueeze(1)

        return image_weight * im_emb + text_weight * text_emb

In [None]:
class ClassificationHead(nn.Module):
    """Three-layer MLP head: d_model -> 128 -> 32 -> num_classes.

    Each hidden layer is followed by ReLU and dropout (p=0.5); the final
    layer's output is returned as-is, with no activation applied.

    Args:
        d_model: Width of the incoming feature vector.
        num_classes: Number of output units.
    """

    def __init__(self, d_model=512, num_classes=1):
        super(ClassificationHead, self).__init__()
        # Progressively narrower projections down to the output size.
        self.linear1 = nn.Linear(d_model, 128)
        self.linear2 = nn.Linear(128, 32)
        self.linear3 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()

        # Normalization was tried and did not help much; dropout was added
        # because the model was overfitting.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map features `x` to the head's output values."""
        hidden = x
        for hidden_layer in (self.linear1, self.linear2):
            hidden = self.dropout(self.relu(hidden_layer(hidden)))
        return self.linear3(hidden)

In [None]:
class FullModel(nn.Module):
    """Chain the encoder, the modality-fusion attention and the classifier.

    Args:
        transformer: Module applied to the raw input batch.
        keyless_attention: Module called with two per-sample embeddings that
            returns a single fused embedding.
        classifier: Module mapping the fused embedding to logits.
    """

    def __init__(self, transformer, keyless_attention, classifier):
        super().__init__()
        self.transformer = transformer
        self.keyless_attention = keyless_attention
        self.classifier = classifier

    def forward(self, x):
        """Return classifier logits for the input batch `x`."""
        # Encoder output should be size [B, 2, D]: two sequence positions per sample.
        encoded = self.transformer(x)
        # Position 0 is passed as the first embedding, position 1 as the second
        # (presumably text and image respectively -- confirm against the data loader).
        first_emb, second_emb = encoded[:, 0, :], encoded[:, 1, :]
        fused = self.keyless_attention(first_emb, second_emb)
        return self.classifier(fused)

# Train & Test Functions

Understanding BCEWithLogitsLoss vs CrossEntropyLoss: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-pytorch-loss-functions.md

"This version is more numerically stable than using a plain Sigmoid followed by a BCELoss as, by combining the operations into one layer, we take advantage of the log-sum-exp trick for numerical stability." (quoted from the PyTorch documentation for `BCEWithLogitsLoss`)

In [None]:
def train(model, device, train_loader, optimizer, loss_fn, epoch, writer):
    """Run one training epoch and log the mean batch loss.

    Args:
        model: Module to optimise; switched to training mode here.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of `(data, label)` batches.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable `loss_fn(output, label)` returning a scalar loss tensor.
        epoch: Epoch number, used for the progress bar, logging and printing.
        writer: Object with `add_scalar(tag, value, step)` for logging.
    """
    model.train()

    # Use tqdm for progress
    progress = tqdm(train_loader, desc=f"Epoch {epoch} Training", leave=False)
    loss_total = 0.0

    for data, label in progress:
        data = data.to(device)
        # Labels get a trailing dimension and float dtype before the loss call.
        label = label.to(device).unsqueeze(1).float()

        # Forward pass (gradients from the previous batch are cleared first)
        optimizer.zero_grad()
        loss = loss_fn(model(data), label)

        # Backward pass & optimize
        loss.backward()
        optimizer.step()

        # Accumulate the loss and show the current batch loss on the bar
        batch_loss = loss.item()
        loss_total += batch_loss
        progress.set_postfix({'Loss': f'{batch_loss:.4f}'})

    # Mean over batches (not over samples)
    avg_loss = loss_total / len(train_loader)
    writer.add_scalar('Loss/Train', avg_loss, epoch)

    print(f'\nEpoch {epoch} Training Loss: {avg_loss:.4f}')

In [None]:
def test(model, device, test_loader, loss_fn, epoch, writer):
    """Evaluate the model on `test_loader`, log loss/accuracy and print metrics.

    Args:
        model: Module to evaluate; switched to eval mode here.
        device: Device each batch is moved to before the forward pass.
        test_loader: Iterable of `(data, label)` batches with binary labels.
        loss_fn: Callable `loss_fn(output, label)` returning a scalar loss tensor.
        epoch: Epoch number, used as the logging step and in the printed summary.
        writer: Object with `add_scalar(tag, value, step)` for logging.
    """
    print(f'*** RUNNING TEST FUNCTION ***')
    model.eval()
    test_loss = 0
    all_probs = []
    all_preds = []
    all_labels = []

    # No gradients are needed for evaluation
    with torch.no_grad():
        for data, label in tqdm(test_loader, desc="Testing", leave=False):
            data, label = data.to(device), label.to(device).unsqueeze(1).float()

            # Get prediction and accumulate the test loss
            output = model(data)
            test_loss += loss_fn(output, label).item()

            # The model emits logits: sigmoid -> probability, 0.5 threshold -> class
            probs = torch.sigmoid(output)
            preds = (probs >= 0.5).float()

            # Flatten the per-batch [B, 1] tensors into plain Python lists
            all_probs.extend(probs.cpu().numpy().flatten().tolist())
            all_preds.extend(preds.cpu().numpy().flatten().tolist())
            all_labels.extend(label.cpu().numpy().flatten().tolist())

    # Use sklearn to do eval comps for us :)
    avg_loss = test_loss / len(test_loader)
    accuracy = 100. * sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    # ROC-AUC is a ranking metric and must be fed the continuous scores;
    # passing thresholded 0/1 predictions collapses the curve to one operating
    # point. It is undefined when only one class is present, so report NaN
    # there instead of letting sklearn raise and abort the run.
    if len(set(all_labels)) > 1:
        roc = roc_auc_score(all_labels, all_probs)
    else:
        roc = float('nan')

    writer.add_scalar('Loss/Test', avg_loss, epoch)
    writer.add_scalar('Accuracy/Test', accuracy, epoch)

    print(f'Epoch {epoch}: Test Loss: {avg_loss:.4f}  |  Accuracy {accuracy:.2f}  |  Precision {precision:.2f}  |  Recall {recall:.2f}  |  F1 {f1:.2f}  |  ROC-AUC {roc:.2f}')

# Main

Initialize the model components.

In [None]:
# Build the three sub-modules with their default sizes (d_model / hidden_size = 512).
transformer = SequenceWiseTransformer()
attention = KeylessAttention(hidden_size=512)
classifier = ClassificationHead()

# Wire them into the end-to-end model and move its parameters to the target device.
model = FullModel(
    transformer=transformer,
    keyless_attention=attention,
    classifier=classifier,
)
model.to(device)

In [None]:
# The loss and optimizer are created once here, outside train()/test(), so they
# are not re-initialised on every pass of the training loop.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=5e-5,
    weight_decay=0.01,
)

# Learning-rate scheduling is currently disabled. Two options kept for reference:
#
# 1) Linear warmup then decay -- `num_warmup_steps` still needs a value before
#    this can be enabled:
#     num_epochs = 20
#     total_steps = len(train_loader) * num_epochs
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer,
#         num_warmup_steps=num_warmup_steps,  # Number of warmup steps
#         num_training_steps=total_steps,     # Total number of training steps
#     )
#
# 2) Step decay:
#     scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard event files are written to Drive so they outlive the Colab session.
writer = SummaryWriter(log_dir='/content/drive/My Drive/tensorboard_logs/experiment1')

num_epochs = 20

# try/finally guarantees the writer is flushed and closed even when training is
# interrupted (manual stop, out-of-memory, a metric error), so the scalars logged
# for the epochs that did finish are not lost.
try:
    for epoch in range(1, num_epochs + 1):
        print(f'beginning epoch: {epoch}')
        train(model, device, train_loader, optimizer, loss_fn, epoch, writer)
        test(model, device, test_loader, loss_fn, epoch, writer)
        # scheduler.step()
finally:
    writer.close()

In [None]:
%load_ext tensorboard
%tensorboard --logdir="/content/drive/My Drive/tensorboard_logs"

# End Here

\\

\\

\\

\\

\\