In [8]:
# Convert raw captions into dataframe

data = []

with open("/content/captions.txt", "r") as f:
    lines = f.readlines()

skipped = 0
for line in lines:
    parts = line.strip().split(" ",1)
    if len(parts) != 2:
        skipped += 1
        print(f"Skipping malformed line: {line}")
        continue  # or log: print(f"Skipping malformed line: {line}")
    image_id, caption = parts
    image = image_id.split("#")[0]
    data.append((image, caption.strip()))

print(f"Skipped {skipped} malformed lines.")

import pandas as pd
df = pd.DataFrame(data, columns=["image", "caption"])
df.head()


Skipping malformed line: image,caption

Skipping malformed line: 2199200615.jpg,

Skipped 2 malformed lines.


Unnamed: 0,image,caption
0,"1000092795.jpg,",Two young guys with shaggy hair look at their ...
1,"1000092795.jpg,""","Two young , White males are outside near many ..."
2,"1000092795.jpg,",Two men in green shirts are standing in a yard .
3,"1000092795.jpg,",A man in a blue shirt standing in a garden .
4,"1000092795.jpg,",Two friends enjoy time spent together .


In [9]:
import pandas as pd

# 1. Load captions
captions = pd.read_csv("/content/Flickr8k.token.txt", sep="\t", names=["image_id", "caption"])
captions["image"] = captions["image_id"].apply(lambda x: x.split("#")[0])

# 2. Load split image lists
def load_image_list(filepath):
    with open(filepath, "r") as f:
        return [line.strip() for line in f.readlines()]

train_imgs = load_image_list("/content/Flickr_8k.trainImages.txt")
val_imgs = load_image_list("/content/Flickr_8k.devImages.txt")
test_imgs = load_image_list("/content/Flickr_8k.testImages.txt")

# 3. Filter captions by split
train_df = captions[captions["image"].isin(train_imgs)]
val_df = captions[captions["image"].isin(val_imgs)]
test_df = captions[captions["image"].isin(test_imgs)]

# 4. Save CSVs
train_df.to_csv("flickr8k_train.csv", index=False)
val_df.to_csv("flickr8k_val.csv", index=False)
test_df.to_csv("flickr8k_test.csv", index=False)

In [9]:
df.to_csv("flickr30k_all_captions.csv",index=False)

In [10]:
# split into Train/Validation/Test (80/10/10)

from sklearn.model_selection import train_test_split

unique_images = df["image"].unique()
train_imgs, temp_imgs = train_test_split(unique_images, test_size=0.2, random_state=42)
val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.5, random_state=42)

def save_split(ids, name):
    split_df = df[df["image"].isin(ids)]
    split_df.to_csv(f"flickr30k_{name}.csv", index=False)

save_split(train_imgs, "train")
save_split(val_imgs, "val")
save_split(test_imgs, "test")

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
flickr8k_dir = "/content/drive/MyDrive/CapstoneProject/Datasets/Flickr8k_images"
flickr8k_csv_dir = "/content/drive/MyDrive/CapstoneProject/Datasets/Flickr8k_text"
flickr30k_dir = "/content/drive/MyDrive/CapstoneProject/Datasets/Flickr30k_images"
flickr30k_csv_dir = "/content/drive/MyDrive/CapstoneProject/Datasets/Flickr30k_captions"
output_dir = "/content/drive/MyDrive/CapstoneProject/Embeddings"

In [3]:
!pip install transformers ftfy
import torch
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [4]:
from PIL import Image
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs.cpu().numpy().squeeze()

def get_text_embeddings(captions):
    inputs = processor(text=captions, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    return outputs.cpu().numpy()

In [5]:
def generate_clip_embeddings(csv_file, img_dir, split_name, output_dir):
    df = pd.read_csv(csv_file)
    grouped = df.groupby("image")

    image_embs = []
    text_embs = []

    for img_name, group in tqdm(grouped):
        img_path = os.path.join(img_dir, img_name)
        if not os.path.exists(img_path):
            continue
        try:
            image_emb = get_image_embedding(img_path)
            text_emb = get_text_embeddings(group["caption"].tolist())

            image_embs.append(image_emb)
            text_embs.extend(text_emb)
        except Exception as e:
            print(f"Error with {img_name}: {e}")
            continue

    np.save(os.path.join(output_dir, f"{split_name}_image_embs.npy"), image_embs)
    np.save(os.path.join(output_dir, f"{split_name}_text_embs.npy"), text_embs)

In [24]:
print(flickr8k_csv_dir)

/content/drive/MyDrive/CapstoneProject/Datasets/Flickr8k_text


In [26]:
generate_clip_embeddings(
    csv_file= flickr30k_csv_dir + "/flickr30k_train.csv",
    img_dir=f"{flickr30k_dir}/images",
    split_name="flickr30k_train",
    output_dir=output_dir
)

generate_clip_embeddings(
    csv_file= flickr30k_csv_dir + "/flickr30k_val.csv",
    img_dir=f"{flickr30k_dir}/images",
    split_name="flickr30k_val",
    output_dir=output_dir
)

generate_clip_embeddings(
    csv_file= flickr30k_csv_dir + "/flickr30k_test.csv",
    img_dir=f"{flickr30k_dir}/images",
    split_name="flickr30k_test",
    output_dir=output_dir
)

100%|██████████| 36037/36037 [00:05<00:00, 6332.76it/s]
100%|██████████| 4505/4505 [00:00<00:00, 5782.00it/s]
100%|██████████| 4505/4505 [00:00<00:00, 5591.08it/s]
