# Visualizing Attention

## Dependencies and imports

In [1]:
!pip install wandb pytorch_lightning tokenizers shap -qqq

In [2]:
import os
import sys
import io
import pickle
import warnings
from copy import deepcopy

warnings.filterwarnings('ignore')

import torch
from PIL import Image
from torchvision import transforms
import ipywidgets as widgets

Clone the project from GitHub:

In [3]:
!git clone --depth 1 https://github.com/reppertj/image-captioning.git

Cloning into 'image-captioning'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 60 (delta 3), reused 44 (delta 3), pack-reused 0[K
Unpacking objects: 100% (60/60), done.


And import from the cloned project:

In [4]:
# Put the cloned "image-captioning" directory on the import path so that the
# `project` package imported below can be resolved. This must run before the
# imports that follow.
sys.path.append("image-captioning")
from project.captioners import CaptioningRNN
from project.datasets import CombinedDataModule
from project.visualization import visualize_minibatch_weights

We want to make sure we're running on a GPU.

In [5]:
# `torch.cuda.is_available` is a function and has to be called: the bare
# function object is always truthy, which would report a GPU on every runtime.
if torch.cuda.is_available():
    print('Yes, running on GPU!')
else:
    print('To use a GPU in Colab, select Runtime -> Change Runtime Type -> GPU')

Yes, running on GPU!


## Model

Run this cell to download pretrained ResNeXt-50 weights to use in the encoder and combine them with some decoder weights from the GitHub repo.

In [6]:
# Build the data module from the test Flickr CSV/images and the tokenizer
# vocabulary shipped in the cloned repository; it is handed to the model below.
datamodule = CombinedDataModule(
    flickr_csv=os.path.join("image-captioning", "tests", "test_data", "test_flickr.csv"),
    flickr_dir=os.path.join("image-captioning", "tests", "test_data", "test_flickr_images"),
    batch_size=4,
    val_size=4,
    test_size=4,
    transform="normalize",
    target_transform="tokenize",
    dev_set=12,
    num_workers=0,
    pretrained_vocab='image-captioning/tokenizer/vocab-2000.txt'
)
datamodule.setup()

# NOTE(security): pickle.load and torch.load unpickle their input, which can
# execute arbitrary code. These files come from the repository cloned above;
# only load them from a source you trust.
with open('image-captioning/models/demo-model/demo-hparams.pkl', 'rb') as f:
    hparams = pickle.load(f)
with open('image-captioning/models/demo-model/decoder-weights.pkl', 'rb') as f:
    # Deserialize onto the CPU so loading does not depend on the device the
    # weights were saved from; the whole model is moved to the GPU afterwards.
    decoder_weights = torch.load(f, map_location='cpu')

model = CaptioningRNN(datamodule=datamodule, config=hparams)

# Overlay the saved decoder tensors on a copy of the freshly built model's
# state dict, leaving every other entry as the constructor produced it, then
# load the merged result back into the model.
existing_states = deepcopy(model.state_dict())
existing_states.update(decoder_weights)
model.load_state_dict(existing_states)

model.to('cuda')
model.batch_size = 4
model.eval();  # trailing ';' suppresses the module repr in the cell output

## Predict and visualize

Run the first cell below to caption one of the bundled sample images, or the second cell to upload your own image, and try different values for the beam width and beam alpha. A larger beam width takes longer but may generate higher-quality captions; a larger alpha tends to favor shorter captions.

In [7]:
@widgets.interact(
    demo_image_name=widgets.Select(
        options=['giraffe', 'guitar', 'horse_riding', 'surfing'],
        value='giraffe',
        disabled=False,
    ),
    beam_width=widgets.IntSlider(value=10, min=1, max=40, step=1, continuous_update=False),
    alpha=widgets.FloatSlider(value=1.5, min=0, max=10, step=0.1, continuous_update=False),
)
def show(demo_image_name, beam_width=10, alpha=1.5):
    """Run the model on one of the bundled sample images and visualize it.

    Args:
        demo_image_name: Base name (without extension) of a JPEG in
            ``image-captioning/notebooks/sample-images``.
        beam_width: Value stored on ``model.inference_beam_width``.
        alpha: Value stored on ``model.inference_beam_alpha``.

    The image is resized to 224x224, converted to a tensor, normalized,
    wrapped in a one-image minibatch dict under the ``'image'`` key, moved to
    the GPU, and handed to ``visualize_minibatch_weights`` with the model.
    """
    # Configure the model's inference settings from the widget values.
    model.inference_beam_alpha = alpha
    model.inference_beam_width = beam_width

    # Per-channel mean / std passed to Normalize.
    # NOTE(review): presumably the statistics used at training time - confirm.
    channel_mean = (0.4435, 0.4201, 0.3837)
    channel_std = (0.2814, 0.2734, 0.2820)
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    sample_path = f"image-captioning/notebooks/sample-images/{demo_image_name}.jpeg"
    pixels = preprocess(Image.open(sample_path).convert("RGB"))

    # unsqueeze(0) adds the leading batch dimension for a single image.
    minibatch = {'image': pixels.unsqueeze(0).to('cuda')}
    visualize_minibatch_weights(minibatch, model)

interactive(children=(Select(description='demo_image_name', options=('giraffe', 'guitar', 'horse_riding', 'sur…

In [8]:
@widgets.interact(
    f=widgets.FileUpload(accept='image/*', multiple=False),
    beam_width=widgets.IntSlider(value=10, min=1, max=40, step=1, continuous_update=False),
    alpha=widgets.FloatSlider(value=1.5, min=0, max=10, step=0.1, continuous_update=False))
def show(f=None, beam_width=10, alpha=1.5):
    """Run the model on each uploaded image and visualize it.

    Args:
        f: Value of the ``FileUpload`` widget. ipywidgets 7 supplies a dict
            mapping file name to a record; ipywidgets 8 supplies a tuple of
            records. Each record holds the raw file bytes under ``'content'``.
            ``None`` or an empty value (nothing uploaded yet) is a no-op.
        beam_width: Value stored on ``model.inference_beam_width``.
        alpha: Value stored on ``model.inference_beam_alpha``.

    Each image is resized to 224x224, converted to a tensor, normalized,
    wrapped in a one-image minibatch dict under the ``'image'`` key, moved to
    the GPU, and handed to ``visualize_minibatch_weights`` with the model.
    """
    if not f:
        # The widget starts out empty and the signature defaults to None;
        # there is nothing to process in either case.
        return

    # Normalize both FileUpload value layouts to an iterable of records.
    uploads = f.values() if isinstance(f, dict) else f

    # None of the following depends on the individual upload, so set it up
    # once rather than on every loop iteration.
    model.inference_beam_width = beam_width
    model.inference_beam_alpha = alpha
    xform = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            # Per-channel mean / std.
            # NOTE(review): presumably the training-time statistics - confirm.
            transforms.Normalize(
                (0.4435, 0.4201, 0.3837), (0.2814, 0.2734, 0.2820)
            ),
        ]
    )

    for upload in uploads:
        # BytesIO accepts any bytes-like object, so this works for both the
        # `bytes` content of ipywidgets 7 and the `memoryview` of ipywidgets 8.
        img = Image.open(io.BytesIO(upload['content'])).convert("RGB")
        mb = {'image': xform(img).unsqueeze(0).to('cuda')}
        visualize_minibatch_weights(mb, model)

interactive(children=(FileUpload(value={}, accept='image/*', description='Upload'), IntSlider(value=10, contin…