In [1]:
# Colab-only step: mount Google Drive at /content/drive. Every dataset path in
# this notebook lives under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import tarfile
from collections import defaultdict

import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer

# Paths
# Single Drive root for every Flickr30K artifact, so relocating the dataset is a
# one-line change. The derived values are identical to the previous literals.
FLICKR_ROOT = "/content/drive/MyDrive/Stat_Learning_Project/Flickr30K"
FLICKR_IMAGE_DIR = f"{FLICKR_ROOT}/images/flickr30k-images"
FLICKR_TOKEN_FILE = f"{FLICKR_ROOT}/results_20130124.token"

OUTPUT_IMAGE_EMBEDDINGS = f"{FLICKR_ROOT}/flickr_image_embeddings.npy"
OUTPUT_CAPTIONS = f"{FLICKR_ROOT}/flickr_captions.npy"

# Initialize CLIP model and processor
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# One checkpoint name feeds both the model and its processor so the two cannot
# silently end up on different checkpoints.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(DEVICE)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# GPT-2 tokenizer for the captions; EOS is reused as the pad token so that the
# padded tokenizer call in convert_flickr_to_npy has a pad id to fill with.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:

TAR_FILE_PATH = "/content/drive/MyDrive/Stat_Learning_Project/Flickr30K/flickr30k-images.tar"

EXTRACTION_DIR = "/content/drive/MyDrive/Stat_Learning_Project/Flickr30K/images/flickr30k-images"

# Extract if not already extracted.
# An existing but empty directory counts as "not extracted", so a run that died
# before writing any file does not make every later run skip extraction.
# (A partially filled directory is still treated as complete.)
already_extracted = os.path.isdir(EXTRACTION_DIR) and len(os.listdir(EXTRACTION_DIR)) > 0

if already_extracted:
    print(f"Images already extracted in {EXTRACTION_DIR}")
else:
    # Open the archive before creating the target, so a missing or corrupt tar
    # fails without leaving an empty directory behind.
    with tarfile.open(TAR_FILE_PATH, 'r') as tar:
        os.makedirs(EXTRACTION_DIR, exist_ok=True)
        # NOTE(review): extractall trusts the member paths stored in the archive.
        # If the tar ever comes from an untrusted source, pass filter="data"
        # (available on Python 3.12+ and recent security backports).
        # NOTE(review): if the archive carries its own top-level folder, the
        # images land one level below EXTRACTION_DIR; confirm this matches
        # FLICKR_IMAGE_DIR.
        tar.extractall(path=EXTRACTION_DIR)
    print(f"Extracted images to {EXTRACTION_DIR}")

In [None]:
def parse_flickr30k(token_file_path):
    """Parse a Flickr30K token file into a mapping of image id -> list of captions.

    Each non-blank line must hold a caption id and a caption separated by a
    tab character. The image id is the part of the caption id before the
    first ``#``. Blank lines are skipped, and only the first tab splits the
    line, so a caption that itself contains a tab is kept intact.

    Args:
        token_file_path: Path of the token file, read as UTF-8 text.

    Returns:
        A ``defaultdict(list)`` keyed by image id, captions in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    captions_dict = defaultdict(list)
    with open(token_file_path, "r", encoding="utf-8") as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Trailing or stray blank lines carry no caption.
                continue
            caption_id, separator, caption = line.partition("\t")
            if not separator:
                raise ValueError(
                    f"{token_file_path}:{line_number}: expected '<caption_id>\\t<caption>', got {line!r}"
                )
            image_id = caption_id.split("#")[0]
            captions_dict[image_id].append(caption)
    return captions_dict

def load_image_and_caption(image_dir, image_id, caption):
    """Open one image as RGB and return it together with its caption.

    Best-effort: any exception raised while building the path or while
    opening/converting the image is printed and turned into ``(None, None)``.

    Args:
        image_dir: Directory that contains the image file.
        image_id: File name of the image inside ``image_dir``.
        caption: Caption passed through unchanged on success.

    Returns:
        ``(image, caption)`` on success, ``(None, None)`` on any failure.
    """
    outcome = (None, None)
    try:
        full_path = os.path.join(image_dir, image_id)
        rgb_image = Image.open(full_path).convert("RGB")
        outcome = (rgb_image, caption)
    except Exception as err:
        print(f"Error loading {image_id}: {err}")
    return outcome

def convert_flickr_to_npy(image_dir, captions_dict, output_image_path, output_caption_path, batch_size=32, max_caption_length=512):
    """Embed images with CLIP, tokenize one caption per image, and save both as .npy.

    Relies on the module-level ``clip_model``, ``processor``, ``tokenizer`` and
    ``DEVICE``. Only the first caption of each image is used. Images are
    decoded one batch at a time (in parallel within the batch), so peak memory
    is bounded by ``batch_size`` rather than by the size of the dataset.
    Images that fail to load, or whose first caption is empty, are dropped;
    the rows of the two saved arrays stay aligned with each other.

    Args:
        image_dir: Directory containing the image files.
        captions_dict: Mapping of image file name -> list of captions.
        output_image_path: Destination for the L2-normalized image embeddings.
        output_caption_path: Destination for the padded caption token ids.
        batch_size: Number of images decoded and embedded per step.
        max_caption_length: Length every tokenized caption is padded or
            truncated to.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``captions_dict`` is
            empty, or no image could be loaded at all.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    image_ids = list(captions_dict.keys())
    if not image_ids:
        raise ValueError("captions_dict is empty; nothing to convert")
    first_captions = [captions_dict[image_id][0] for image_id in image_ids]

    image_embeddings = []
    tokenized_captions = []

    print("Processing images and captions in batches...")
    with ThreadPoolExecutor() as executor:
        for start in tqdm(range(0, len(image_ids), batch_size), desc="Batch Processing"):
            stop = start + batch_size
            batch_ids = image_ids[start:stop]

            # Executor.map yields results in submission order, which keeps each
            # image paired with its own caption.
            loaded = executor.map(
                load_image_and_caption,
                [image_dir] * len(batch_ids),
                batch_ids,
                first_captions[start:stop],
            )
            # Explicit None check for the image; an empty caption is still
            # dropped, as before.
            kept = [(image, caption) for image, caption in loaded if image is not None and caption]
            if not kept:
                # Every image in this slice failed to load; nothing to embed.
                continue
            batch_images = [image for image, _ in kept]
            batch_captions = [caption for _, caption in kept]

            inputs = processor(images=batch_images, return_tensors="pt", padding=True).to(DEVICE)
            with torch.no_grad():
                batch_image_emb = clip_model.get_image_features(**inputs)
                # Scale each embedding to unit L2 norm.
                batch_image_emb /= batch_image_emb.norm(dim=-1, keepdim=True)
            image_embeddings.append(batch_image_emb.cpu().numpy())

            tokens = tokenizer(
                batch_captions,
                padding="max_length",
                truncation=True,
                max_length=max_caption_length,
                return_tensors="pt",
            )
            tokenized_captions.append(tokens.input_ids.numpy())

    if not image_embeddings:
        raise ValueError(f"No image under {image_dir} could be loaded; nothing was saved")

    np.save(output_image_path, np.vstack(image_embeddings))
    np.save(output_caption_path, np.vstack(tokenized_captions))

In [None]:
# Run the conversion: parse the token file, then embed images and tokenize captions.
print("Parsing captions...")
captions_dict = parse_flickr30k(token_file_path=FLICKR_TOKEN_FILE)

print("Processing images and captions...")
convert_flickr_to_npy(
    image_dir=FLICKR_IMAGE_DIR,
    captions_dict=captions_dict,
    output_image_path=OUTPUT_IMAGE_EMBEDDINGS,
    output_caption_path=OUTPUT_CAPTIONS,
)

In [None]:
# Load the caption array written by convert_flickr_to_npy. OUTPUT_CAPTIONS is
# the only captions file this notebook saves; the name FLICKR_CAPTIONS used here
# before is not defined anywhere in the notebook.
caption_embeddings = np.load(OUTPUT_CAPTIONS)

# NOTE(review): that file holds padded tokenizer ids (integers), not learned
# embeddings; confirm that L2-normalizing token ids is really what is intended.
row_norms = np.linalg.norm(caption_embeddings, axis=-1, keepdims=True)
# Guard against division by zero: an all-zero row is left as zeros, not NaN.
row_norms[row_norms == 0] = 1.0
caption_embeddings_norm = caption_embeddings / row_norms

normalized_captions_path = '/content/drive/MyDrive/Stat_Learning_Project/Flickr30K/images/Flickr30k/flickr_captions_normalized.npy'
# np.save does not create missing parent directories, and nothing else in the
# notebook creates this one.
os.makedirs(os.path.dirname(normalized_captions_path), exist_ok=True)
np.save(normalized_captions_path, caption_embeddings_norm.astype(np.float32))

print("Flickr30k conversion complete!")