# Image Captioning using CNNs & LSTMs

## Setup notebook


In [2]:
import math
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

from models import EncoderCNN, DecoderRNN
from utils import train, test


In [3]:
# Seed PyTorch's RNG so repeated runs start from the same random state.
torch.manual_seed(0)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Report the library version and the selected device, one per line.
print(torch.__version__, device, sep="\n")


1.9.0+cu111
cuda:0


## COCO Dataset


### Download & extract the dataset


In [4]:
DATASET_URLS = [
    "http://images.cocodataset.org/zips/train2014.zip",
    "http://images.cocodataset.org/zips/val2014.zip",
    "http://images.cocodataset.org/zips/test2014.zip"
]
DATASET_DIR = Path("./datasets/coco")
MODEL_DIR = Path("./checkpoints/coco")
RESULTS_DIR = Path("./results/coco")

DATASET_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

for url in DATASET_URLS:
    name, stem = Path(url).name, Path(url).stem
    if (DATASET_DIR / stem).exists():
        continue

    !wget -P {DATASET_DIR} {url}
    !tar -xf {DATASET_DIR / name} --directory {DATASET_DIR}


Downloading test2014.zip...


In [None]:
# --- Training schedule ---
BATCH_SIZE = 32  # number of samples per batch handed to the data loaders
EPOCHS = 1  # intended number of training epochs

# --- Model dimensions ---
EMBED_SIZE = 512  # dimensionality of the word embeddings
HIDDEN_SIZE = 512  # number of features in the RNN decoder's hidden state

# --- Vocabulary construction ---
vocab_from_file = True  # when True, load an existing vocab file
vocab_threshold = 6  # minimum word count threshold


### Setup training and testing sets

* Each image is resized to 256 px, center-cropped to 224×224, converted to a tensor, and normalized with the per-channel mean and standard deviation of the ImageNet dataset


In [None]:
# Preprocessing applied to every image: resize to 256, take the central
# 224x224 crop, convert to a tensor, then normalize each channel with the
# ImageNet mean and standard deviation.
transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.485, 0.456, 0.406),
            (0.229, 0.224, 0.225),
        ),
    ]
)

# Build the training data loader.
# The earlier `datasets.ImageFolder(MIAS_DATASET_DIR, ...)` lines were removed:
# `MIAS_DATASET_DIR` and `test_dataset` are not defined anywhere in this
# notebook, so the cell raised NameError before reaching `get_loader`.
# NOTE(review): `get_loader` is not imported in the visible imports cell;
# import it from the module that defines it before running this cell.
train_loader = get_loader(
    transform=transform,
    mode="train",
    batch_size=BATCH_SIZE,  # was the undefined lowercase `batch_size`
    vocab_threshold=vocab_threshold,
    vocab_from_file=vocab_from_file,
)

# Second name for the same loader, so cells written against either name work.
data_loader = train_loader

# TODO(review): no `test_loader` is built here. The original one wrapped the
# undefined `test_dataset`; create it with `get_loader` once its test-mode
# arguments are confirmed.

# The size of the vocabulary.
vocab_size = len(train_loader.dataset.vocab)

# Set the total number of training steps per epoch.
# total_step = math.ceil(
#     len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size
# )


## Build & Train the Encoder-Decoder Model


In [None]:
# Instantiate the encoder and decoder on the selected device, using the
# hyperparameter constants from the configuration cell (the lowercase
# `embed_size` / `hidden_size` names were never defined).
encoder = EncoderCNN(EMBED_SIZE).to(device)
decoder = DecoderRNN(EMBED_SIZE, HIDDEN_SIZE, vocab_size).to(device)

# Only the decoder and the encoder's `embed` and `bn` sub-modules are handed
# to the optimizer; no other encoder parameters are updated by it.
params = [
    *decoder.parameters(),
    *encoder.embed.parameters(),
    *encoder.bn.parameters(),
]
optimizer = optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08)

criterion = nn.CrossEntropyLoss().to(device)

# NOTE(review): `model` is not defined in any visible cell. Confirm what
# `utils.train` expects as its first argument (for example a module wrapping
# `encoder` and `decoder`) and define it before running this cell.
train_losses = train(
    model,
    train_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=EPOCHS,  # use the configured constant instead of a hard-coded 100
    device=device,
    save_dir=MODEL_DIR,
)
