# In [None]:

# Image Caption Preprocessing Script

"""
Read COCO captions JSON, build tokenizer, create sequences and save:
- tokenizer.json
- captions_data.npz
"""

import os, json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict

# --- User configuration: edit these values before running -------------------
ANNOTATIONS_DIR = "/path/to/coco/annotations"  # searched for the captions_train* file
FEATURE_DIR = "/path/to/features"  # expected to hold one <image stem>.npy per image
VOCAB_SIZE = 10000  # handed to the tokenizer as num_words
MAX_LEN = 50  # upper bound on the padded sequence length
SUBSET = None  # optional cap on the number of (image, caption) pairs; falsy keeps all

# Outputs are written next to this script. When __file__ is not defined
# (e.g. the code is pasted into an interactive session) use the current
# working directory instead.
if "__file__" in globals():
    OUT_DIR = os.path.dirname(__file__)
else:
    OUT_DIR = os.getcwd()

# Artifacts produced by this script.
CAPTION_DATA_PATH = os.path.join(OUT_DIR, "captions_data.npz")
TOKENIZER_PATH = os.path.join(OUT_DIR, "tokenizer.json")

# Load annotations
# Pick the training-captions JSON from ANNOTATIONS_DIR. os.listdir returns
# entries in arbitrary order, so the listing is sorted to make the choice
# reproducible, and a .json suffix is required so that archives or other
# files whose names merely contain "captions_train" are never passed to
# json.load.
ann_file = None
for fn in sorted(os.listdir(ANNOTATIONS_DIR)):
    if "captions_train" in fn and fn.lower().endswith(".json"):
        ann_file = os.path.join(ANNOTATIONS_DIR, fn)
        break

if not ann_file:
    raise FileNotFoundError("captions_train*.json not found")

# The rest of the script reads the "annotations" and "images" lists from
# this parsed document.
with open(ann_file, "r", encoding="utf-8") as f:
    coco = json.load(f)

# Build id -> captions
# Group every caption under its image id, lower-cased and wrapped in
# "<start>" / "<end>" markers.
id2caps = defaultdict(list)
for ann in coco["annotations"]:
    img_id = ann["image_id"]
    caption = "<start> " + ann["caption"].strip().lower() + " <end>"
    id2caps[img_id].append(caption)

img_id_to_fname = {img["id"]: img["file_name"] for img in coco["images"]}

# Emit one (image id, file name, caption) triple per caption, restricted to
# images whose feature file "<file stem>.npy" exists in FEATURE_DIR.
pairs = []
for img_id, fname in img_id_to_fname.items():
    feat_file = os.path.splitext(fname)[0] + ".npy"
    feat_path = os.path.join(FEATURE_DIR, feat_file)
    if not os.path.exists(feat_path):
        continue
    # .get() rather than indexing: indexing a defaultdict would insert an
    # empty entry for every image that has no captions.
    for cap in id2caps.get(img_id, []):
        pairs.append((img_id, fname, cap))

# Fail early with an actionable message. Without this check the script goes
# on to write an empty tokenizer and then dies with an opaque
# "max() arg is an empty sequence" ValueError.
if not pairs:
    raise ValueError(
        "no (image, caption) pairs found: no image listed in the annotations "
        f"has a matching .npy feature file in {FEATURE_DIR!r}"
    )

# Optional cap on the dataset size; None (or any falsy value) keeps all pairs.
if SUBSET:
    pairs = pairs[:SUBSET]

all_captions = [p[2] for p in pairs]

# Fit the vocabulary on the selected captions. The filter set leaves out
# "<" and ">" so the "<start>" / "<end>" markers survive as single tokens.
# The backslash is written as "\\" because "\]" is an invalid escape
# sequence (a SyntaxWarning on recent Python versions); the resulting string
# is unchanged.
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="<unk>",
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(all_captions)

# Persist the fitted tokenizer so the same word <-> id mapping can be
# reloaded later.
with open(TOKENIZER_PATH, "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())

sequences = tokenizer.texts_to_sequences(all_captions)

# Pad to the longest caption, but never beyond MAX_LEN.
max_len = min(MAX_LEN, max(len(seq) for seq in sequences))

# pad_sequences truncates from the FRONT by default, which would cut the
# "<start>" token (and the first words) off any caption longer than max_len.
# Clip over-long captions at the tail instead and re-append "<end>", so every
# stored row still opens with <start> and closes with <end>.
end_id = tokenizer.word_index["<end>"]
clipped = [
    seq if len(seq) <= max_len else seq[: max_len - 1] + [end_id]
    for seq in sequences
]
padded = pad_sequences(clipped, maxlen=max_len, padding="post", truncating="post")

# Store the padded token matrix together with a row-aligned array of image
# file names: both are derived from `pairs` in the same order, so row i of
# `padded` is a caption for img_fnames[i].
img_fnames = [fname for _img_id, fname, _caption in pairs]
fname_array = np.array(img_fnames)

np.savez_compressed(CAPTION_DATA_PATH, padded=padded, img_fnames=fname_array)
