In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used in
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install PyTorch, torchvision, Hugging Face transformers and sentencepiece.
!pip install torch torchvision transformers sentencepiece

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:

# Install Multilingual-CLIP from PyPI and OpenAI CLIP straight from its GitHub repository.
!pip install multilingual-clip
!pip install git+https://github.com/openai/CLIP.git

Collecting multilingual-clip
  Downloading multilingual_clip-1.0.10-py3-none-any.whl.metadata (14 kB)
Downloading multilingual_clip-1.0.10-py3-none-any.whl (20 kB)
Installing collected packages: multilingual-clip
Successfully installed multilingual-clip-1.0.10
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-2gcio3gc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-2gcio3gc
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torch.multiprocessing import set_start_method
from multilingual_clip import pt_multilingual_clip
import transformers
import clip
from PIL import Image
from torch.utils.data.dataloader import default_collate
from tqdm import tqdm

# Ensure using 'spawn' start method for CUDA
# A RuntimeError here is deliberately ignored (presumably it means a start
# method was already set, e.g. when this cell is re-run -- TODO confirm).
try:
    set_start_method('spawn')
except RuntimeError:
    pass

# Root directory of the dataset on the mounted Google Drive
data_dir = '/content/drive/MyDrive/Colab Notebooks/thesis/MMDravi/'

# Define paths to the images and split files
image_splits_dir = os.path.join(data_dir, 'image_splits')
images_dir = os.path.join(data_dir, 'flickr30k_images/flickr30k_images')

#read image split files
def read_image_splits(file_path):
    """Read an image split file and return its image file names.

    Args:
        file_path: Path to a text file with one image file name per line.

    Returns:
        A list of file names with surrounding whitespace removed. Blank
        lines (for example a trailing empty line at the end of the file)
        are skipped so they do not turn into empty file names.
    """
    # Explicit encoding keeps the result independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        return [name for name in stripped if name]

# Load the image file names listed in each split file
train_images = read_image_splits(os.path.join(image_splits_dir, 'train_images.txt'))
val_images = read_image_splits(os.path.join(image_splits_dir, 'val_images.txt'))
test_images_2016 = read_image_splits(os.path.join(image_splits_dir, 'test_2016_images.txt'))
# The image files listed in the 2017 test split are not found in the
# flickr30k images folder, so that split is skipped for now.
#test_images_2017 = read_image_splits(os.path.join(image_splits_dir, 'test_2017_images.txt'))

#load captions
def load_captions(file_path):
    """Return all lines of the UTF-8 text file at ``file_path``, whitespace-stripped.

    Every line is kept (including lines that become empty after stripping),
    so the position of a caption in the result matches its line in the file.
    """
    stripped_lines = []
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Caption file locations; the training captions are split across three files.
train_captions_files = [
    os.path.join(data_dir, 'train.lc.norm.tok.1.google.ta'),
    os.path.join(data_dir, 'train.lc.norm.tok.2.google.ta'),
    os.path.join(data_dir, 'train.lc.norm.tok.3.google.ta')
]
val_captions_file = os.path.join(data_dir, 'val.lc.norm.tok.google.ta')
test_captions_file_2016 = os.path.join(data_dir, 'test_2016_flickr.lc.norm.tok.google.ta')
test_captions_file_2017 = os.path.join(data_dir, 'test_2017_flickr.lc.norm.tok.google.ta')

# Read the captions; the training files are concatenated in list order.
train_captions = [
    caption
    for captions_path in train_captions_files
    for caption in load_captions(captions_path)
]
val_captions = load_captions(val_captions_file)
test_captions_2016 = load_captions(test_captions_file_2016)
#test_captions_2017 = load_captions(test_captions_file_2017)

# Sanity check: each split should have as many captions as images.
for message, collection in (
    ("length of train images:", train_images),
    ("length of train captions:", train_captions),
    ("length of val images:", val_images),
    ("length of val captions:", val_captions),
    ("length of test images:", test_images_2016),
    ("length of test captions:", test_captions_2016),
):
    print(message, len(collection))

length of train images: 29000
length of train captions: 29000
length of val images: 1014
length of val captions: 1014
length of test images: 1000
length of test captions: 1000


In [None]:
#dataset class
class MMDraviDataset(Dataset):
    """Dataset pairing image file names with captions.

    Each item is a tuple ``(image_path, image, caption)`` where ``image`` is
    the output of ``preprocess`` moved to ``device``. When the image cannot
    be opened or preprocessed the item is ``None`` so that a collate function
    can drop it.
    """

    def __init__(self, image_paths, captions, images_dir, preprocess, tokenizer, text_model, device):
        """Store the inputs after checking that images and captions line up.

        Args:
            image_paths: Image file names, relative to ``images_dir``.
            captions: Captions; ``captions[i]`` belongs to ``image_paths[i]``.
            images_dir: Directory containing the image files.
            preprocess: Callable applied to the RGB PIL image.
            tokenizer: Stored on the instance; not used by this class itself.
            text_model: Stored on the instance; not used by this class itself.
            device: Device the preprocessed image is moved to.

        Raises:
            ValueError: If ``image_paths`` and ``captions`` differ in length.
        """
        # Items are paired purely by index, so a length mismatch would either
        # raise IndexError mid-run or silently pair images with wrong captions.
        if len(image_paths) != len(captions):
            raise ValueError(
                f"Number of images ({len(image_paths)}) does not match "
                f"number of captions ({len(captions)})"
            )
        self.image_paths = image_paths
        self.captions = captions
        self.images_dir = images_dir
        self.preprocess = preprocess
        self.tokenizer = tokenizer
        self.text_model = text_model
        self.device = device

    def __len__(self):
        """Return the number of image/caption pairs."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image_path, image, caption)`` for ``idx``, or ``None`` on failure."""
        image_path = self.image_paths[idx]
        caption = self.captions[idx]

        try:
            # Process image; the context manager closes the file handle even
            # if conversion or preprocessing fails.
            with Image.open(os.path.join(self.images_dir, image_path)) as raw_image:
                image = self.preprocess(raw_image.convert("RGB")).to(self.device)
            return image_path, image, caption
        except IOError as e:
            print(f"Error processing {image_path}: {e}")
            return None  # Signal a failed item; the collate function filters these out
        except Exception as e:
            print(f"Unexpected error processing {image_path}: {e}")
            return None  # Signal a failed item; the collate function filters these out

def custom_collate_fn(batch):
    """Collate a batch after dropping items that are ``None``.

    One message is printed per dropped item. If nothing remains, three empty
    lists are returned; otherwise the remaining items go through
    ``default_collate``.
    """
    samples = list(batch)
    kept = [sample for sample in samples if sample is not None]

    # Log once for every sample that was dropped.
    for _ in range(len(samples) - len(kept)):
        print("Found None item in batch")

    if not kept:
        return [], [], []  # Entire batch was invalid

    return default_collate(kept)

#creating data loaders
def create_data_loader(image_paths, captions, images_dir, preprocess, tokenizer, text_model, device, batch_size=64, shuffle=True):
    """Build an ``MMDraviDataset`` and wrap it in a ``DataLoader``.

    The loader uses ``custom_collate_fn`` for batching; the dataset size is
    printed once the loader has been created.
    """
    mmdravi_dataset = MMDraviDataset(
        image_paths=image_paths,
        captions=captions,
        images_dir=images_dir,
        preprocess=preprocess,
        tokenizer=tokenizer,
        text_model=text_model,
        device=device,
    )
    loader = DataLoader(
        mmdravi_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=custom_collate_fn,
    )
    print(f"DataLoader created with {len(mmdravi_dataset)} samples")
    return loader

# Select the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the multilingual text model, its tokenizer, and the CLIP image model.
# Any failure is reported and then re-raised so later cells cannot run with
# missing models.
# NOTE(review): only the CLIP image model is loaded with `device=device`;
# text_model is never moved to `device` here, and a later cell prints `cpu`
# for text_model.device -- confirm whether that is intended.
try:
    text_model_name = 'M-CLIP/LABSE-Vit-L-14'
    text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(text_model_name)
    print(f"{text_model_name} text model loaded successfuly")
    tokenizer = transformers.AutoTokenizer.from_pretrained(text_model_name)
    print("Tokenizer loaded successfully")
    image_model, preprocess = clip.load("ViT-L/14", device=device)
    print("Models and tokenizers loaded successfully")
except Exception as e:
    print(f"Error loading models or tokenizers: {e}")
    raise e

# Create data loaders (training data is shuffled by default; validation and
# test loaders keep the file order)
train_loader = create_data_loader(train_images, train_captions, images_dir, preprocess, tokenizer, text_model, device)
val_loader = create_data_loader(val_images, val_captions, images_dir, preprocess, tokenizer, text_model, device, shuffle=False)
test_loader_2016 = create_data_loader(test_images_2016, test_captions_2016, images_dir, preprocess, tokenizer, text_model, device, shuffle=False)
print("Data loaders created successfully.")

Using device: cuda
M-CLIP/LABSE-Vit-L-14 text model loaded successfuly
Tokenizer loaded successfully
Models and tokenizers loaded successfully
DataLoader created with 29000 samples
DataLoader created with 1014 samples
DataLoader created with 1000 samples
Data loaders created successfully.


In [None]:
#collecting and saving embeddings
def collect_and_save_embeddings(data_loader, image_model, text_model, tokenizer, device, save_path):
    """Encode every batch with both models and save the paired embeddings.

    Args:
        data_loader: Iterable yielding ``(image_paths, images, captions)``
            batches; an empty ``images`` marks a batch with no valid items.
        image_model: Object exposing ``encode_image(images)``.
        text_model: Object exposing ``forward(captions, tokenizer)``.
        tokenizer: Tokenizer handed to ``text_model.forward``.
        device: Device the image batch is moved to before encoding.
        save_path: Destination file for ``torch.save``. Its parent directory
            is created when missing.

    The saved object is a list with one dict per sample, holding the keys
    ``image_path``, ``image_embedding``, ``text_embedding`` and ``caption``.
    Batches that fail are reported and skipped; a failure while saving is
    reported but not raised.
    """
    all_embeddings = []

    # tqdm provides a progress bar over the batches
    for i, (image_paths, images, captions) in enumerate(tqdm(data_loader, desc="Processing batches")):
        if len(images) == 0:
            print(f"All images in batch {i + 1} are None. Skipping this batch.")
            continue

        try:
            images = images.to(device)

            # Inference only: no gradients are needed
            with torch.no_grad():
                try:
                    image_embeds = image_model.encode_image(images)
                except Exception as e:
                    print(f"Error extracting image embeddings in batch {i + 1}: {e}")
                    continue  # skip this batch

                try:
                    text_embeds = text_model.forward(captions, tokenizer)
                except Exception as e:
                    print(f"Error extracting text embeddings in batch {i + 1}: {e}")
                    continue  # skip this batch

            # Store each sample's embeddings on the CPU together with its identifiers
            for img_path, img_emb, txt_emb, caption in zip(image_paths, image_embeds, text_embeds, captions):
                all_embeddings.append({
                    'image_path': img_path,
                    'image_embedding': img_emb.cpu(),
                    'text_embedding': txt_emb.cpu(),
                    'caption': caption
                })

        except Exception as e:
            print(f"Error in processing batch {i + 1}: {e}")

    # Save all embeddings to a single file
    try:
        # Create the target directory first: without it torch.save fails and
        # all embeddings computed above would be lost.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        torch.save(all_embeddings, save_path)
        print(f"Saved embeddings to {save_path}")
    except Exception as e:
        print(f"Error saving embeddings to {save_path}: {e}")

In [None]:
# Directory on Google Drive where the embedding files are written
save_dir = '/content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse'

In [None]:
# Print the text model's `device` attribute to see where it is placed
print(text_model.device)

cpu


In [None]:
# Compute embeddings for the training loader and save them as a single file
collect_and_save_embeddings(train_loader, image_model, text_model, tokenizer, device, os.path.join(save_dir, 'labse_train_embeddings.pth'))

Processing batches: 100%|██████████| 454/454 [3:14:45<00:00, 25.74s/it]


Saved embeddings to /content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse/labse_train_embeddings.pth


In [None]:
import torch

#preview the first few entries
def preview_embeddings(embeddings, num_entries=5):
    """Print the number of embeddings and a short view of the first entries.

    For each of the first ``num_entries`` items the image path, the first
    five values of both embeddings, and the caption are printed. Items are
    read through the keys ``image_path``, ``image_embedding``,
    ``text_embedding`` and ``caption``.
    """
    print(f"Total number of embeddings: {len(embeddings)}")
    leading_entries = embeddings[:num_entries]
    for position, record in enumerate(leading_entries, start=1):
        print(f"\nEntry {position}:")
        print(f"Image Path: {record['image_path']}")
        # Only the first few values of each embedding are shown
        image_head = record['image_embedding'][:5]
        print(f"Image Embedding: {image_head}...")
        text_head = record['text_embedding'][:5]
        print(f"Text Embedding: {text_head}...")
        print(f"Caption: {record['caption']}")

# Path to the saved train embeddings file
save_path = '/content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse/labse_train_embeddings.pth'

# Load the embeddings back from disk
# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# PyTorch version/settings -- only load files produced by this notebook.
labse_train_embeddings = torch.load(save_path)

# Print a short preview of the first entries
preview_embeddings(labse_train_embeddings)

In [None]:
# Compute embeddings for the 2016 test loader and save them as a single file
collect_and_save_embeddings(test_loader_2016, image_model, text_model, tokenizer, device, os.path.join(save_dir, 'labse_test_2016_embeddings.pth'))

Processing batches: 100%|██████████| 16/16 [11:58<00:00, 44.88s/it]


Saved embeddings to /content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse/labse_test_2016_embeddings.pth


In [None]:
# Path to the saved 2016 test embeddings file
save_path = '/content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse/labse_test_2016_embeddings.pth'

# Load the embeddings back from disk
labse_test_2016_embeddings = torch.load(save_path)

# Print a short preview of the first entries
preview_embeddings(labse_test_2016_embeddings)

Total number of embeddings: 1000

Entry 1:
Image Path: 1007129816.jpg
Image Embedding: tensor([ 0.5273,  0.0310,  0.7612, -0.5669, -0.3145], dtype=torch.float16)...
Text Embedding: tensor([-0.0741,  0.2546, -0.1395, -0.0054,  0.2642])...
Caption: ஒரு ஆரஞ்சு தொப்பி ஒரு மனிதன் ஏதாவது நடித்தார்.

Entry 2:
Image Path: 1009434119.jpg
Image Embedding: tensor([ 5.5029e-01,  8.1494e-01,  2.8125e-01, -3.5645e-01,  3.8910e-04],
       dtype=torch.float16)...
Text Embedding: tensor([ 0.4257,  0.3573,  0.0326, -0.2773, -0.0328])...
Caption: ஒரு போஸ்டன் டெரியர் ஒரு வெள்ளை வேலி முன் பசுமையான புல் இயங்கும்.

Entry 3:
Image Path: 101362133.jpg
Image Embedding: tensor([ 0.0176, -0.3918,  0.5522,  0.7544, -0.5361], dtype=torch.float16)...
Text Embedding: tensor([-0.0120, -0.4445,  0.1994,  0.0087, -0.0450])...
Caption: கரேட் சீருடையில் ஒரு பெண் முன் ஒரு கிக் ஒரு குச்சி உடைத்து.

Entry 4:
Image Path: 102617084.jpg
Image Embedding: tensor([ 0.5654,  0.6143,  0.3904,  0.0358, -0.3350], dtype=torch.float16)

In [None]:
# Compute embeddings for the validation loader and save them as a single file
collect_and_save_embeddings(val_loader, image_model, text_model, tokenizer, device, os.path.join(save_dir, 'labse_val_embeddings.pth'))

Processing batches: 100%|██████████| 16/16 [13:54<00:00, 52.14s/it]

Saved embeddings to /content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse/labse_val_embeddings.pth





In [None]:
def preview_embeddings(embeddings, num_entries=5):
    """Print how many embeddings there are, then summarize the first few.

    NOTE(review): a function with this name and the same behavior is already
    defined in an earlier cell of this notebook; this definition shadows it.

    For each of the first ``num_entries`` items, prints the image path, the
    first five values of the image and text embeddings, and the caption.
    """
    print(f"Total number of embeddings: {len(embeddings)}")
    for offset, item in enumerate(embeddings[:num_entries]):
        entry_number = offset + 1
        print(f"\nEntry {entry_number}:")
        print(f"Image Path: {item['image_path']}")
        # Show only the first few values of each embedding
        print(f"Image Embedding: {item['image_embedding'][:5]}...")
        print(f"Text Embedding: {item['text_embedding'][:5]}...")
        print(f"Caption: {item['caption']}")

In [None]:
# Path to the saved validation embeddings file
save_path = '/content/drive/MyDrive/Colab Notebooks/thesis/embeddings/labse/labse_val_embeddings.pth'

# Load the embeddings back from disk
labse_val_embeddings = torch.load(save_path)

# Print a short preview of the first entries
preview_embeddings(labse_val_embeddings)

Total number of embeddings: 1014

Entry 1:
Image Path: 1018148011.jpg
Image Embedding: tensor([ 0.1012,  0.4971, -0.6328,  0.7017,  0.4465], dtype=torch.float16)...
Text Embedding: tensor([-0.1718,  0.2804, -0.1001,  0.4348,  0.2444])...
Caption: ஆண்கள் ஒரு குழு ஒரு டிரக் மீது பருத்தி ஏற்றும்

Entry 2:
Image Path: 1029450589.jpg
Image Embedding: tensor([ 0.9038,  1.3730,  0.2546,  0.8823, -0.1313], dtype=torch.float16)...
Text Embedding: tensor([-0.0818,  0.8853,  0.2320, -0.4320, -0.2134])...
Caption: ஒரு படுக்கையில் ஒரு பச்சை அறையில் ஒரு மனிதன் தூங்கி.

Entry 3:
Image Path: 1029737941.jpg
Image Embedding: tensor([ 0.7334, -0.0198, -0.1046,  1.1055, -0.2297], dtype=torch.float16)...
Text Embedding: tensor([ 0.3376,  0.0708,  0.0562, -0.4094, -0.0230])...
Caption: ஹெட்ஃபோன்களை அணிந்து கொண்டிருக்கும் ஒரு பெண் ஒரு பெண் மற்றும் அபோஸ் தோள்களில் அமர்ந்துள்ளார்.

Entry 4:
Image Path: 103205630.jpg
Image Embedding: tensor([ 0.6084,  0.4980, -0.8193,  0.0561, -0.5044], dtype=torch.float16)...
