# Initialization

## Imports

In [1]:
import torch
import torchvision

import json
import os
import requests
import zipfile
from pathlib import Path

import matplotlib.pyplot as plt
from PIL import Image

## Device-agnostic setup

In [2]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
# Record the library versions this notebook was run with.
print("pytorch version:", torch.__version__)
print("torchvision version:", torchvision.__version__)

pytroch version: 2.10.0.dev20251120+cu130
torchvision version: 0.25.0.dev20251121+cu130


## Downloading the Dataset

In [4]:
def get_data(link: str, folder_name: str=None, data_path: Path=None) -> Path:
    """Download a zip archive and unpack it into ``data_path / folder_name``.

    If the target folder already exists the download is skipped, so the
    function can be called again on every notebook run.

    Args:
        link: URL of the zip archive to fetch.
        folder_name: Name of the folder to extract into. Defaults to the last
            path component of ``link`` up to its first dot.
        data_path: Directory that holds the dataset folders (``str`` or
            ``Path``). Defaults to ``data/`` relative to the working directory.

    Returns:
        Path of the folder holding the extracted archive contents.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    data_path = Path(data_path) if data_path else Path("data/")
    if not folder_name:
        folder_name = link.split("/")[-1].split(".")[0]
    folder_path = data_path / folder_name
    print(folder_path)

    if folder_path.is_dir():
        print("data already downloaded in place.")
        return folder_path

    print('downloading data')
    folder_path.parent.mkdir(parents=True, exist_ok=True)
    zip_path = Path(str(folder_path) + ".zip")
    # Extract next to the target and rename only on success, so a failed run
    # never leaves a folder that the is_dir() check above would mistake for a
    # finished download.
    staging_path = Path(str(folder_path) + ".partial")

    try:
        # Stream to disk in chunks: the archives can be far larger than what
        # should be held in memory at once.
        with requests.get(link, stream=True, timeout=60) as response:
            # Fail loudly on 4xx/5xx instead of saving an error page as a zip.
            response.raise_for_status()
            with open(zip_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        print("download finished. extracting zip file")

        with zipfile.ZipFile(zip_path, "r") as zipref:
            zipref.extractall(staging_path)
        staging_path.rename(folder_path)
        print("finished extraction")
    finally:
        # Remove the archive itself (not the extracted folder), whether or not
        # the steps above succeeded.
        if zip_path.exists():
            zip_path.unlink()
            print("removed zip file")

    return folder_path

In [5]:
# Fetch the COCO 2017 validation image archive (skipped when already on disk).
IMG_DIR = get_data(
    link="http://images.cocodataset.org/zips/val2017.zip",
    folder_name="coco2017val",
)
# Fetch the COCO 2017 train/val annotation archive (skipped when already on disk).
ANN_PATH = get_data(
    link="http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
    folder_name="coco2017annot",
)

data/coco2017val
data already downloaded in place.
data/coco2017annot
data already downloaded in place.


In [6]:
# Point at the actual payload inside each extracted archive. The guards keep
# this cell idempotent: re-running it must not append the sub-path a second time.
if IMG_DIR.name != "val2017":
    IMG_DIR = IMG_DIR / "val2017"
if ANN_PATH.suffix != ".json":
    ANN_PATH = ANN_PATH / "annotations/captions_train2017.json"
# NOTE(review): the images come from the val2017 archive but the captions from
# the train2017 file, so caption image ids may not correspond to files in
# IMG_DIR — confirm whether captions_val2017.json was intended.
IMG_DIR, ANN_PATH

(PosixPath('data/coco2017val/val2017'),
 PosixPath('data/coco2017annot/annotations/captions_train2017.json'))

## Load full annotation file

In [7]:
# Read the whole captions file. JSON text is UTF-8, so the encoding is set
# explicitly instead of relying on the platform's default.
with open(ANN_PATH, "r", encoding="utf-8") as f:
    coco = json.load(f)

# Keep only the list stored under the "annotations" key.
annotations_full = coco["annotations"]

In [8]:
# Preview the first five annotation records to check their structure.
annotations_full[:5]

[{'image_id': 203564,
  'id': 37,
  'caption': 'A bicycle replica with a clock as the front wheel.'},
 {'image_id': 322141,
  'id': 49,
  'caption': 'A room with blue walls and a white sink and door.'},
 {'image_id': 16977,
  'id': 89,
  'caption': 'A car that seems to be parked illegally behind a legally parked car'},
 {'image_id': 106140,
  'id': 98,
  'caption': 'A large passenger airplane flying through the air.'},
 {'image_id': 106140,
  'id': 101,
  'caption': 'There is a GOL plane taking off in a partly cloudy sky.'}]

## Taking the captions of the first 500 images

In [9]:
# Group captions by image id, keeping only the first MAX_IMAGES distinct images
# in annotation order.
MAX_IMAGES = 500

image_captions = {}
for ann in annotations_full:
    img_id = ann["image_id"]

    if img_id not in image_captions:
        if len(image_captions) >= MAX_IMAGES:
            # Cap reached: ignore images we have not selected, but keep
            # scanning. Annotations are not grouped by image, so stopping here
            # would drop later captions of images that are already selected.
            continue
        image_captions[img_id] = []

    image_captions[img_id].append(ann["caption"])

In [12]:
# Number of distinct image ids collected.
print(len(image_captions))
# Show the captions gathered for one example image id.
image_captions[16977]

500


['A car that seems to be parked illegally behind a legally parked car',
 'two cars parked on the sidewalk on the street',
 'City street with parked cars and a bench.',
 'Cars try to maneuver into parking spaces along a densely packed city street. ',
 'A couple of cars parked in a busy street sidewalk.']

## Create a list of samples