1. Data Preparation

In [1]:
import os
import zipfile
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchvision.transforms as transforms
from torch.optim import Adam
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Where the downloaded archive lives and where its contents should go
zip_path = os.path.expanduser('~/Downloads/archive(1).zip')
extract_dir = './flickr8k_data'

# Unpack once: an existing target path is taken to mean "already extracted"
already_extracted = os.path.exists(extract_dir)
if not already_extracted:
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_dir)

print("Extracted to:", extract_dir)
print("Contents:", os.listdir(extract_dir))

In [2]:
# Dataset root. Defaults to ~/Downloads/archive resolved for the current user
# (instead of hardcoding one machine's home directory); set the FLICKR8K_DIR
# environment variable to point somewhere else.
extract_dir = os.environ.get('FLICKR8K_DIR', os.path.expanduser('~/Downloads/archive'))
print("Extracted to:", extract_dir)
print("Contents:", os.listdir(extract_dir))

Extracted to: /Users/sfowler14/Downloads/archive
Contents: ['captions.txt', 'Images']


In [3]:
# Load the caption table from the dataset root
captions_path = os.path.join(extract_dir, 'captions.txt')
df = pd.read_csv(captions_path)

# Preview the first rows, then report how many distinct image files are referenced
print(df.head())
n_unique_images = df['image'].nunique()
print(f"\nNumber of unique images: {n_unique_images}")

                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  

Number of unique images: 8091


2. Image Feature Extraction

In [4]:
# --- Image preprocessing: resize, center-crop, tensorize, ImageNet normalization ---
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
]
transform = transforms.Compose(preprocessing_steps)

# --- Dataset class for images ---
class FlickrImageDataset(Dataset):
    """Dataset yielding ``(image, filename)`` pairs for files in one directory.

    Args:
        image_dir: Directory containing the image files.
        image_filenames: Iterable of file names relative to ``image_dir``; it is
            copied into a list, so any iterable (e.g. a NumPy array) is accepted.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, image_dir, image_filenames, transform=None):
        self.image_dir = image_dir
        self.image_filenames = list(image_filenames)
        self.transform = transform

    def __len__(self):
        """Return the number of image files in the dataset."""
        return len(self.image_filenames)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(image, filename)``.

        A file that cannot be opened or converted is reported on stdout and
        replaced by a blank 224x224 RGB image so that iteration continues.
        """
        img_name = self.image_filenames[idx]
        path = os.path.join(self.image_dir, img_name)
        try:
            # The context manager releases the file handle deterministically;
            # convert() hands back a separate image object that outlives it.
            with Image.open(path) as raw_image:
                image = raw_image.convert("RGB")
        except Exception as e:
            # Best-effort fallback: log the failure and substitute a placeholder
            print(f"Failed to open {path}: {e}")
            image = Image.new('RGB', (224, 224))
        if self.transform:
            image = self.transform(image)
        return image, img_name

In [5]:
# --- Model setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained ResNet-50 with its last child module (the FC layer) dropped, so the
# network returns pooled features instead of class scores.
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*(list(resnet.children())[:-1]))  # remove FC layer
resnet.to(device)
resnet.eval()

# --- Create dataset and dataloader ---
image_dir = os.path.join(extract_dir, "Images")
unique_images = df["image"].unique()
image_dataset = FlickrImageDataset(image_dir, unique_images, transform)
# num_workers=0: FlickrImageDataset is defined inside the notebook, and worker
# processes started with the "spawn" method (macOS/Windows) cannot import it.
# This also matches the loader that is actually used for feature extraction.
image_loader = DataLoader(image_dataset, batch_size=32, shuffle=False, num_workers=0)



In [6]:
# Preprocessing pipeline: resize -> center crop -> tensor -> per-channel normalization
normalize = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

# Feature extractor: pretrained ResNet-50 without its last child module,
# moved to the available device and switched to eval mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
backbone = models.resnet50(pretrained=True)
feature_layers = list(backbone.children())[:-1]
resnet = torch.nn.Sequential(*feature_layers).to(device).eval()

In [7]:
# Take the image list from the caption table rather than from os.listdir():
# stray files in the folder (e.g. .DS_Store) would otherwise get feature rows,
# and a captioned image absent from the listing would have no index when the
# captions are mapped to images. Sorting keeps the row order deterministic.
image_filenames = sorted(df["image"].unique())
dataset = FlickrImageDataset(image_dir, image_filenames, transform)
loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)  # num_workers=0 is safest

# Accumulators filled by the feature-extraction loop
image_features_list = []
image_names = []

In [8]:
# Start from empty accumulators so that re-running this cell cannot append a
# second copy of every feature row and file name.
image_features_list = []
image_names = []

with torch.no_grad():  # inference only, no gradients needed
    for imgs, names in tqdm(loader, desc="Extracting image features"):
        imgs = imgs.to(device)
        feats = resnet(imgs)               # (B, 2048, 1, 1)
        feats = feats.view(feats.size(0), -1)  # (B, 2048)
        image_features_list.append(feats.cpu())
        image_names.extend(names)

# One row per image, in the same order as image_names
image_features = torch.cat(image_features_list, dim=0).numpy()
print("Done. image_features shape:", image_features.shape)

Extracting image features: 100%|██████████| 253/253 [11:36<00:00,  2.75s/it]


Done. image_features shape: (8091, 2048)


3. Caption Feature Extraction

In [9]:
from transformers import BertTokenizer, BertModel

# --- Device and model setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and encoder are loaded from the same pretrained checkpoint;
# the encoder is moved to the device and put in eval mode
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert = BertModel.from_pretrained(bert_checkpoint)
bert = bert.to(device).eval()

# --- Dataset for captions ---
class FlickrCaptionDataset(Dataset):
    """Dataset that serves raw caption strings, one per index."""

    def __init__(self, captions):
        # Keep a reference to the module-level tokenizer next to the captions
        self.tokenizer = tokenizer
        self.captions = captions

    def __getitem__(self, idx):
        # Hand back the untokenized text; tokenization happens per batch in collate_fn
        caption = self.captions[idx]
        return caption

    def __len__(self):
        return len(self.captions)

# --- Custom collate_fn to batch tokenize ---
def collate_fn(batch_texts):
    """Tokenize a batch of caption strings with the module-level ``tokenizer``.

    The tokenizer is asked for PyTorch tensors, with truncation and padding
    enabled and a maximum length of 64.
    """
    tokenizer_options = dict(
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=64,
    )
    encoded_batch = tokenizer(batch_texts, **tokenizer_options)
    return encoded_batch


2025-11-03 14:25:55.733525: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# --- Create dataset and dataloader ---
caption_texts = df["caption"].tolist()
caption_dataset = FlickrCaptionDataset(caption_texts)

# Unshuffled, single-process loading; collate_fn receives each batch of captions
caption_loader = DataLoader(caption_dataset, batch_size=32, shuffle=False,
                            num_workers=0, collate_fn=collate_fn)

In [11]:
# --- Extract features ---
caption_features = []

with torch.no_grad():
    for batch in tqdm(caption_loader, desc="Extracting caption features"):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = bert(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # (B, 768)
        caption_features.append(cls_embeddings.cpu())

caption_features = torch.cat(caption_features, dim=0).numpy()

print("Caption features shape:", caption_features.shape)

Extracting caption features: 100%|██████████| 1265/1265 [14:39<00:00,  1.44it/s]


Caption features shape: (40455, 768)


4. Data Splitting

In [12]:
# Map image filename to its index
image_to_idx = {name: i for i, name in enumerate(image_names)}

# For each caption row, find which image it corresponds to
caption_to_image_idx = df["image"].map(image_to_idx).values

print("caption_to_image_idx shape:", caption_to_image_idx.shape)
print("Example mapping:", list(zip(df['caption'][:3], caption_to_image_idx[:3])))

caption_to_image_idx shape: (40455,)
Example mapping: [('A child in a pink dress is climbing up a set of stairs in an entry way .', 0), ('A girl going into a wooden building .', 0), ('A little girl climbing into a wooden playhouse .', 0)]


In [13]:
# Persist the full-dataset arrays: features, caption->image mapping, file names
full_dataset_arrays = [
    ("flickr8k_image_features.npy", image_features),
    ("flickr8k_caption_features.npy", caption_features),
    ("flickr8k_caption_to_image.npy", caption_to_image_idx),
    ("flickr8k_image_names.npy", np.array(image_names)),
]
for npy_path, array in full_dataset_arrays:
    np.save(npy_path, array)

In [14]:
# Split on images
n_images = len(image_names)
indices = np.arange(n_images)

In [15]:
# 70/15/15 split
train_idx, temp_idx = train_test_split(indices, test_size=0.30, random_state=42, shuffle=True)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42, shuffle=True)
print(f"Train images: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}")

Train images: 5663, Val: 1214, Test: 1214


In [16]:
# Create masks for captions 
caption_to_image_idx = caption_to_image_idx.astype(int)

train_mask = np.isin(caption_to_image_idx, train_idx)
val_mask = np.isin(caption_to_image_idx, val_idx)
test_mask = np.isin(caption_to_image_idx, test_idx)

# Split image features 
image_train = image_features[train_idx]
image_val   = image_features[val_idx]
image_test  = image_features[test_idx]

# Split captions (and keep their alignment)
caption_train = caption_features[train_mask]
caption_val   = caption_features[val_mask]
caption_test  = caption_features[test_mask]

In [17]:
# Link captions to local image indices within each split
def remap_caption_indices(global_indices, split_indices):
    """Translate global image indices into positions within one split.

    Args:
        global_indices: Iterable of global image indices, one per caption.
        split_indices: Global image indices belonging to the split; the position
            of each value in this sequence becomes its local index.

    Returns:
        1-D integer NumPy array of local indices in ``0..len(split_indices)-1``.
        Entries of ``global_indices`` that do not occur in ``split_indices`` are
        skipped, so pre-filter the captions with the same membership mask to
        keep the result aligned with them.
    """
    local_of = {g: i for i, g in enumerate(split_indices)}
    # dtype=int keeps the result usable as an index array even when it is empty
    # (np.array([]) would otherwise default to float64).
    return np.array([local_of[g] for g in global_indices if g in local_of], dtype=int)

In [18]:
# Local (within-split) image index for every caption of each split
split_pairs = (
    (train_mask, train_idx),
    (val_mask, val_idx),
    (test_mask, test_idx),
)
caption_to_train_img, caption_to_val_img, caption_to_test_img = [
    remap_caption_indices(caption_to_image_idx[mask], split_idx)
    for mask, split_idx in split_pairs
]

In [19]:
# Sanity check
print("Train split shapes:")
print("  Image features:", image_train.shape)
print("  Caption features:", caption_train.shape)
print("  Caption→Image indices:", caption_to_train_img.shape)

Train split shapes:
  Image features: (5663, 2048)
  Caption features: (28315, 768)
  Caption→Image indices: (28315,)


In [20]:
# Persist every split: image features, caption features, caption->image links
split_outputs = [
    ("train_image_features.npy", image_train),
    ("val_image_features.npy", image_val),
    ("test_image_features.npy", image_test),
    ("train_caption_features.npy", caption_train),
    ("val_caption_features.npy", caption_val),
    ("test_caption_features.npy", caption_test),
    ("train_caption_to_image.npy", caption_to_train_img),
    ("val_caption_to_image.npy", caption_to_val_img),
    ("test_caption_to_image.npy", caption_to_test_img),
]
for npy_path, array in split_outputs:
    np.save(npy_path, array)

## Data Preparation Complete!

The following .npy files have been saved:

**Full dataset:**
- `flickr8k_image_features.npy`
- `flickr8k_caption_features.npy`
- `flickr8k_caption_to_image.npy`
- `flickr8k_image_names.npy`

**Train/Val/Test splits:**
- `train_image_features.npy`, `train_caption_features.npy`, `train_caption_to_image.npy`
- `val_image_features.npy`, `val_caption_features.npy`, `val_caption_to_image.npy`
- `test_image_features.npy`, `test_caption_features.npy`, `test_caption_to_image.npy`

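These files can now be loaded by architecture notebooks (e.g., Architecture_1.ipynb, Cross_Modal_Autoencoder.ipynb). Note that each split's `*_caption_to_image.npy` holds indices into the rows of that same split's `*_image_features.npy` (split-local indices), whereas `flickr8k_caption_to_image.npy` holds indices into the rows of `flickr8k_image_features.npy`.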