In [None]:
import torch
import numpy as np
import random
import warnings
import os
import json
from PIL import Image
import torchvision.transforms as transforms
from torchvision.transforms.functional import InterpolationMode

# Silence library warnings so cell output stays readable.
# NOTE(review): this is a blanket filter, so deprecation notices are hidden too.
warnings.filterwarnings("ignore")

# One seed shared by every RNG seeded here (Python, NumPy, PyTorch), so it can
# be changed in a single place.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

In [None]:
import kagglehub

# Fetch the mini COCO-2014 captioning dataset through kagglehub. The returned
# location is kept in `path`, which the following cell uses as the dataset root.
path = kagglehub.dataset_download(
    "nagasai524/mini-coco2014-dataset-for-image-captioning"
)
print(f"Path to dataset files: {path}")

In [None]:
dataset_path = path
json_file = os.path.join(dataset_path, "captions.json")

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file is accepted either as a dict holding an "annotations" list or as the
# bare list of annotation records.
annotations = data["annotations"] if isinstance(data, dict) and "annotations" in data else data

# Group captions by image id; dict insertion order keeps ids in first-seen
# order, which get_coco_dataset relies on for positional lookup.
captions_dict = {}
for ann in annotations:
    captions_dict.setdefault(ann["image_id"], []).append(ann["caption"])

# Pick the first directory under the dataset root that directly contains a
# .jpg file (the match is case-sensitive).
image_dir = next(
    (
        root
        for root, _dirs, files in os.walk(dataset_path)
        if any(name.endswith(".jpg") for name in files)
    ),
    None,
)
if image_dir is None:
    # Fail here rather than later inside os.path.join(None, ...).
    raise FileNotFoundError(
        f"No directory containing .jpg files found under {dataset_path!r}"
    )

print(f"Loaded {len(captions_dict)} images with captions")
print(f"Image directory: {image_dir}")

In [None]:
def load_coco_image(image_id, size=384):
    """Load one dataset image as a square RGB PIL image.

    Two file names are tried inside the module-level ``image_dir``: the
    COCO-2014 training name ``COCO_train2014_<12-digit id>.jpg`` first, then
    the plain ``<image_id>.jpg``.

    Args:
        image_id: Image identifier. It is formatted with ``:012d``, so an
            integer is expected.
        size: Edge length in pixels of the returned image. The image is
            resized to ``size`` x ``size`` without preserving aspect ratio.

    Returns:
        A ``PIL.Image.Image`` in RGB mode of size ``(size, size)``.

    Raises:
        FileNotFoundError: If neither candidate file exists.
    """
    coco_path = os.path.join(image_dir, f"COCO_train2014_{image_id:012d}.jpg")
    plain_path = os.path.join(image_dir, f"{image_id}.jpg")

    img_path = coco_path if os.path.exists(coco_path) else plain_path
    if not os.path.exists(img_path):
        raise FileNotFoundError(
            f"No image found for id {image_id!r}; tried {coco_path} and {plain_path}"
        )

    # The context manager releases the file handle deterministically; convert()
    # and resize() return new in-memory images, so the result outlives it.
    with Image.open(img_path) as raw:
        return raw.convert("RGB").resize((size, size))


def get_coco_dataset(id=3, for_attention=False):
    """Fetch the sample stored at position ``id`` of ``captions_dict``.

    Args:
        id: Positional index into the keys of ``captions_dict``.
        for_attention: Selects the return form (see Returns).

    Returns:
        When ``for_attention`` is true, the image passed through
        ``transforms.ToTensor()``. Otherwise the tuple
        ``(None, captions, image)``, where ``captions`` is the list stored for
        that image id and ``image`` is the 384x384 PIL image.
    """
    # Index the keys by position to turn `id` into an image id.
    key = list(captions_dict)[id]
    picture = load_coco_image(key, size=384)

    if not for_attention:
        # The first slot is always None; callers unpack three values.
        return None, captions_dict[key], picture

    return transforms.ToTensor()(picture)


def divide_list(lst, num_chunks=5):
    """Split ``lst`` into ``num_chunks`` consecutive slices of near-equal length.

    When the length is not evenly divisible, the leading slices each receive
    one extra element. Slices may be empty if ``lst`` is shorter than
    ``num_chunks``.

    Args:
        lst: Any sliceable sequence with a length.
        num_chunks: Number of slices to produce.

    Returns:
        A list of ``num_chunks`` slices of ``lst``, in order.
    """
    base, extra = divmod(len(lst), num_chunks)
    pieces = []
    cursor = 0
    for index in range(num_chunks):
        # The first `extra` pieces absorb the leftover elements, one each.
        width = base + 1 if index < extra else base
        pieces.append(lst[cursor:cursor + width])
        cursor += width
    return pieces


def load_image(id, image_size=384, device=device, before=True, dataset="coco"):
    """Load a dataset image as a tensor on ``device``.

    Args:
        id: Positional sample index forwarded to ``get_coco_dataset``.
        image_size: Target height and width passed to the bicubic resize.
        device: Device the resulting tensor is moved to.
        before: If true, return the resized image converted with
            ``PILToTensor`` only (no normalisation, no batch axis). If false,
            return the resized image converted with ``ToTensor``, normalised,
            and given a leading batch axis via ``unsqueeze(0)``.
        dataset: Dataset name; only ``"coco"`` is accepted.

    Returns:
        The image tensor in the form selected by ``before``.

    Raises:
        ValueError: If ``dataset`` is anything other than ``"coco"``.
    """
    if dataset != "coco":
        raise ValueError(f"Dataset {dataset} not supported")

    pil_image = get_coco_dataset(id)[2]
    resize = transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC)

    if before:
        to_raw = transforms.Compose([resize, transforms.PILToTensor()])
        return to_raw(pil_image).to(device)

    # NOTE(review): these per-channel mean/std values look like the CLIP
    # preprocessing statistics; confirm against the model this feeds.
    to_model_input = transforms.Compose([
        resize,
        transforms.ToTensor(),
        transforms.Normalize(
            (0.48145466, 0.4578275, 0.40821073),
            (0.26862954, 0.26130258, 0.27577711),
        ),
    ])
    return to_model_input(pil_image).unsqueeze(0).to(device)

In [None]:
# Smoke-test the helpers on the first sample; the assertions make a broken
# helper fail this cell instead of only printing a wrong value.
_, captions, img = get_coco_dataset(id=0)
assert img.size == (384, 384), f"unexpected image size: {img.size}"
assert captions, "expected at least one caption for the first image"
print(f"Image type: {type(img)}, Size: {img.size}")
print(f"First caption: {captions[0]}")

test_list = list(range(25))
divided = divide_list(test_list, num_chunks=5)
assert len(divided) == 5
# Concatenating the chunks must give back the input unchanged.
assert [item for chunk in divided for item in chunk] == test_list
print(f"divide_list test: {len(divided)} chunks")

tensor_img = load_image(id=0, image_size=384, device=device, before=True)
# before=True returns a single RGB image without a batch axis.
assert tuple(tensor_img.shape) == (3, 384, 384)
print(f"load_image output shape: {tensor_img.shape}")