Description
- Input: an image
- Output: a natural-language caption describing what the picture shows


Source: https://arxiv.org/pdf/1502.03044.pdf

Show, Attend and Tell: Neural Image Caption Generation with Visual Attention

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import gc
import json
import os
import pickle
import re
import time
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

In [0]:
annotation_folder = '/annotations/'

# Download and extract the COCO caption annotations unless ./annotations/ already exists.
if not os.path.exists(os.path.abspath('.') + annotation_folder):
  annotation_zip = tf.keras.utils.get_file('captions.zip',
                                           cache_subdir=os.path.abspath('.'),
                                           origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
                                           extract=True)

  annotation_file = os.path.dirname(annotation_zip) + '/annotations/captions_train2014.json'

  # The archive is no longer needed once it has been extracted.
  os.remove(annotation_zip)
else:
  # The folder is already present (e.g. from an earlier run): point at the
  # existing file so `annotation_file` is defined on every code path.
  annotation_file = os.path.abspath('.') + annotation_folder + 'captions_train2014.json'

In [9]:
image_folder = '/train2014/'

# Reuse ./train2014/ when it is already present; otherwise download the
# archive, extract it next to the notebook and delete the zip afterwards.
if os.path.exists(os.path.abspath('.') + image_folder):
  PATH = os.path.abspath('.') + image_folder
else:
  image_zip = tf.keras.utils.get_file(
      'train2014.zip',
      cache_subdir=os.path.abspath('.'),
      origin='http://images.cocodataset.org/zips/train2014.zip',
      extract=True,
  )
  PATH = os.path.dirname(image_zip) + image_folder
  os.remove(image_zip)

Downloading data from http://images.cocodataset.org/zips/train2014.zip


## Limit Data-Set Numbers


image caption annotation

```
annotation{
"id": int, 
"image_id": int, 
"caption": str,
}
```

In [0]:
# Resolve the annotation file relative to the working directory (where the
# download cell extracts the archive) instead of hard-coding Colab's /content.
annotation_file = os.path.abspath('.') + '/annotations/captions_train2014.json'

In [0]:
# Parse the caption annotations. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
with open(annotation_file, 'r', encoding='utf-8') as f:
  annotations = json.load(f)

In [19]:
annotations['annotations'][:2]

[{'caption': 'A very clean and well decorated empty bathroom',
  'id': 48,
  'image_id': 318556},
 {'caption': 'A panoramic view of a kitchen and all of its appliances.',
  'id': 67,
  'image_id': 116100}]

In [21]:
os.listdir('/content/train2014')[:2]

['COCO_train2014_000000066976.jpg', 'COCO_train2014_000000253219.jpg']

In [0]:
# Wrap every caption in <start>/<end> markers.
all_captions = [
    '<start> ' + annot['caption'] + ' <end>'
    for annot in annotations['annotations']
]

# Matching image path for each caption: PATH + zero-padded 12-digit image id.
all_img_name_vector = [
    PATH + 'COCO_train2014_' + '%012d.jpg' % (annot['image_id'])
    for annot in annotations['annotations']
]

In [36]:
all_captions[:2]

['<start> A very clean and well decorated empty bathroom <end>',
 '<start> A panoramic view of a kitchen and all of its appliances. <end>']

In [35]:
all_img_name_vector[:2]

['/content/train2014/COCO_train2014_000000318556.jpg',
 '/content/train2014/COCO_train2014_000000116100.jpg']

In [0]:
num_examples = 30000

# Shuffle captions and image paths in unison (fixed seed), then keep only the
# first `num_examples` pairs of each.
train_captions, img_name_vector = (
    shuffled[:num_examples]
    for shuffled in shuffle(all_captions, all_img_name_vector, random_state=1)
)

In [39]:
gc.collect()

717

In [40]:
print(len(train_captions))
print(len(img_name_vector))

30000
30000


## Preprocess Image

In [0]:
def load_image(image_path):
  """Read a JPEG file and prepare it for InceptionV3.

  The file is decoded to 3 channels, resized to 299x299 and passed through
  `inception_v3.preprocess_input`.

  Returns:
    A tuple `(preprocessed_image, image_path)`; the path is passed through unchanged.
  """
  raw_bytes = tf.io.read_file(image_path)
  decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
  resized = tf.image.resize(decoded, (299, 299))
  return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

## Initialize Image Encoder

In [42]:
# InceptionV3 without its classification head (include_top=False), loaded with ImageNet weights.
image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [43]:
image_model.input

<tf.Tensor 'input_1:0' shape=(None, None, None, 3) dtype=float32>

In [0]:
image_model.summary()

In [0]:
# Feature extractor: same input as InceptionV3, output taken from its last layer.
hidden_layer = image_model.layers[-1].output
new_input = image_model.input

image_features_extract_model = tf.keras.Model(inputs=new_input, outputs=hidden_layer)

## Preprocess the images and cache the extracted features to disk so the notebook can run in Colab
## Keeping all of the features in memory at once would exceed Colab's capacity

In [0]:
# De-duplicate the image paths (several captions share one image) and sort them.
encode_train = list(set(img_name_vector))
encode_train.sort()

# Dataset of (preprocessed image, path) pairs in batches of 16.
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encode_train)
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(16)
)

In [51]:
# Print only the first batch as a sanity check.
for data in image_dataset.take(1):
  print(data)

(<tf.Tensor: shape=(16, 299, 299, 3), dtype=float32, numpy=
array([[[[-0.94172734, -0.9338842 , -0.9730999 ],
         [-0.94509804, -0.9372549 , -0.9764706 ],
         [-0.9372549 , -0.92941177, -0.96862745],
         ...,
         [ 0.09706461,  0.13553667,  0.10441196],
         [-0.25968188, -0.24626416, -0.24372792],
         [-0.36637044, -0.36613643, -0.3023631 ]],

        [[-0.94172734, -0.9338842 , -0.9730999 ],
         [-0.95009506, -0.9422519 , -0.98146766],
         [-0.9379986 , -0.93015546, -0.96937114],
         ...,
         [ 0.4545579 ,  0.477512  ,  0.4600668 ],
         [ 0.38514173,  0.39711416,  0.39318597],
         [ 0.3043332 ,  0.30258572,  0.33447576]],

        [[-0.94172734, -0.9338842 , -0.9730999 ],
         [-0.9524559 , -0.94461274, -0.9838284 ],
         [-0.9384222 , -0.93057907, -0.96979475],
         ...,
         [ 0.6256294 ,  0.6422862 ,  0.63541365],
         [ 0.57628095,  0.5872253 ,  0.5829663 ],
         [ 0.5576018 ,  0.5585724 ,  0.55808

In [53]:
# Run every image batch through the feature extractor and cache each image's
# features on disk. `tqdm.notebook.tqdm` replaces the deprecated `tqdm_notebook`.
for img, path in tqdm(image_dataset):
  batch_features = image_features_extract_model(img)
  # Collapse the two spatial axes into one: (batch, H, W, C) -> (batch, H*W, C).
  batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))

  for bf, p in zip(batch_features, path):
    # Saved under the image's own path; np.save appends '.npy', which is the
    # suffix `map_func` adds back when loading.
    path_of_features = p.numpy().decode('utf-8')
    np.save(path_of_features, bf.numpy())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [72]:
path_of_features

'/content/train2014/COCO_train2014_000000581909.jpg'

## Text Part

In [0]:
def calc_max_length(tensor):
  """Return the length of the longest sequence in `tensor`.

  Returns 0 when `tensor` contains no sequences (instead of raising ValueError).
  """
  return max((len(t) for t in tensor), default=0)

In [0]:
# Keep only the top_k (5000) most frequent words; other words map to '<unk>'.
top_k = 5000
# `filters` is a raw string: the original literal contained the invalid escape
# sequence `\]`. The characters are unchanged. '<' and '>' are deliberately not
# filtered so the '<start>' / '<end>' markers survive tokenization.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token='<unk>',
                                                  filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')

tokenizer.fit_on_texts(train_captions)

# Convert each caption into a list of integer token ids.
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [92]:
tokenizer.word_index  # word: index
tokenizer.index_word  # index: word

{1: '<unk>',
 2: 'a',
 3: '<start>',
 4: '<end>',
 5: 'on',
 6: 'of',
 7: 'the',
 8: 'in',
 9: 'with',
 10: 'and',
 11: 'is',
 12: 'man',
 13: 'to',
 14: 'sitting',
 15: 'an',
 16: 'two',
 17: 'people',
 18: 'at',
 19: 'standing',
 20: 'are',
 21: 'white',
 22: 'next',
 23: 'woman',
 24: 'street',
 25: 'table',
 26: 'that',
 27: 'holding',
 28: 'some',
 29: 'large',
 30: 'it',
 31: 'person',
 32: 'down',
 33: 'top',
 34: 'up',
 35: 'group',
 36: 'field',
 37: 'small',
 38: 'his',
 39: 'plate',
 40: 'black',
 41: 'tennis',
 42: 'near',
 43: 'front',
 44: 'room',
 45: 'dog',
 46: 'young',
 47: 'riding',
 48: 'train',
 49: 'by',
 50: 'red',
 51: 'baseball',
 52: 'water',
 53: 'cat',
 54: 'playing',
 55: 'has',
 56: 'walking',
 57: 'sign',
 58: 'bathroom',
 59: 'while',
 60: 'blue',
 61: 'kitchen',
 62: 'bus',
 63: 'food',
 64: 'there',
 65: 'green',
 66: 'bed',
 67: 'parked',
 68: 'grass',
 69: 'pizza',
 70: 'looking',
 71: 'snow',
 72: 'other',
 73: 'ball',
 74: 'beach',
 75: 'side',
 76

In [0]:
# Register '<pad>' under index 0 in both lookup tables (index -> word, word -> index).
tokenizer.index_word[0] = '<pad>'
tokenizer.word_index['<pad>'] = 0

In [0]:
# Re-run the caption -> token-id conversion now that '<pad>' has been registered
# (same call as in the tokenizer cell).
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [95]:
train_seqs

[[3, 2, 351, 687, 2, 280, 5, 2, 84, 339, 4],
 [3, 2, 31, 2356, 112, 7, 137, 5, 159, 4],
 [3, 2, 318, 284, 9, 28, 525, 1083, 305, 30, 4],
 [3, 2, 1118, 845, 1539, 57, 11, 1203, 219, 2, 91, 1008, 13, 2, 1290, 4],
 [3, 37, 152, 8, 2, 376, 155, 127, 214, 6, 2, 65, 39, 4],
 [3, 1823, 6, 17, 19, 109, 7, 90, 108, 4],
 [3, 2, 199, 11, 27, 2, 40, 10, 432, 556, 4],
 [3, 17, 479, 5, 111, 613, 8, 2, 2984, 471, 44, 4],
 [3, 16, 335, 42, 100, 202, 8, 2, 36, 4],
 [3, 2, 12, 290, 5, 2, 173, 115, 8, 2, 143, 4],
 [3, 2, 35, 6, 81, 19, 109, 2, 25, 9, 2127, 59, 100, 985, 2, 2630, 4],
 [3, 2, 58, 9, 2, 90, 10, 4705, 451, 13, 7, 136, 4],
 [3, 2, 23, 14, 5, 2, 104, 101, 290, 5, 2, 115, 4],
 [3, 2, 23, 9, 2, 98, 5, 85, 1009, 18, 7, 540, 4],
 [3, 7, 365, 11, 4706, 38, 3587, 199, 1010, 4],
 [3, 2, 37, 53, 11, 14, 5, 28, 1824, 4],
 [3, 16, 1617, 399, 22, 9, 332, 34, 1972, 10, 332, 34, 399, 22, 13, 241, 4],
 [3, 184, 327, 18, 15, 377, 1825, 8, 7, 1826, 4],
 [3, 2, 294, 12, 9, 2, 2985, 96, 8, 7, 3588, 6, 2, 562, 3

In [0]:
# Pad every caption to the same length; padding is appended after the tokens ('post').
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

In [97]:
# Length of the longest row of the padded caption matrix; `evaluate` uses it as
# the maximum number of decoding steps.
max_length = calc_max_length(cap_vector)

print(max_length)

49


In [0]:
# Hold out 20% of the (image path, caption) pairs for validation; fixed seed.
img_name_train, img_name_val, cap_train, cap_val = train_test_split(
    img_name_vector,
    cap_vector,
    test_size=0.2,
    random_state=0,
)

In [99]:
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

(24000, 24000, 6000, 6000)

## TF Dataset For training

In [0]:
# Training hyper-parameters and model sizes.
BATCH_SIZE = 64
BUFFER_SIZE = 1000  # shuffle buffer size of the training dataset
embedding_dim = 256  # size of the word embedding and of the CNN_Encoder projection
units = 512  # GRU / attention hidden size
vocab_size = top_k + 1  # presumably +1 for the padding index 0 — TODO confirm
num_steps = len(img_name_train) // BATCH_SIZE  # number of full batches per epoch

feature_shape = 2048  # matches the last axis of the cached features; not referenced in the visible cells
attention_features_shape = 64  # width of each attention row stored by `evaluate`

In [0]:
# Load Numpy files

def map_func(img_name, cap):
  """Load the cached feature array that belongs to one image.

  Args:
    img_name: image path as UTF-8 bytes; the features are read from `<img_name>.npy`.
    cap: the caption for that image, returned unchanged.

  Returns:
    A tuple `(features, cap)`.
  """
  feature_file = img_name.decode('utf-8') + '.npy'
  return np.load(feature_file), cap

In [0]:
# Training pipeline: load cached features per (path, caption) pair through
# `map_func`, then shuffle, batch and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    .map(
        lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [75]:
# Each dataset element contains:
# 1) the image feature array produced by the Inception model, and
# 2) the padded caption token array.

# Print only the first batch as a sanity check.
for data in dataset.take(1):
  print(data)

(<tf.Tensor: shape=(64, 64, 2048), dtype=float32, numpy=
array([[[1.2805442 , 1.804351  , 0.26045445, ..., 0.        ,
         2.2486188 , 1.3179123 ],
        [0.21489905, 1.049051  , 0.74307495, ..., 0.        ,
         2.0143673 , 0.9203752 ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.7525834 , 0.4663775 ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.163355  ,
         0.        , 1.265678  ],
        [0.        , 0.        , 0.31871387, ..., 0.        ,
         0.        , 0.06302447],
        [0.        , 0.        , 2.2397766 , ..., 0.        ,
         0.        , 0.        ]],

       [[0.        , 0.17240909, 0.91065544, ..., 0.5616593 ,
         0.6227289 , 0.10870036],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.35035312, 0.4656505 ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.6071424 ],
        ...,
        [0.        , 0.        , 0.        , ..., 0

# Model Architecture

In [0]:
class Attention(tf.keras.layers.Layer):
  """Additive (Bahdanau-style) attention over image feature locations.

  `call` expects `inputs = [features, hidden]`:
    features: (batch, locations, feature_dim). In this notebook the decoder
      passes the CNN_Encoder output, so feature_dim is presumably
      `embedding_dim` rather than 2048 — TODO confirm.
    hidden: (batch, hidden_size), the decoder state.

  Returns `(context_vector, attention_weight)` with shapes
  (batch, feature_dim) and (batch, locations, 1).
  """

  def __init__(self, units):
    super().__init__()
    self.units = units
    self.w1 = tf.keras.layers.Dense(units)
    self.w2 = tf.keras.layers.Dense(units)
    self.v = tf.keras.layers.Dense(1)

  def call(self, inputs):
    features = inputs[0]
    # Add a length-1 axis so the hidden state broadcasts across all locations.
    query = tf.expand_dims(inputs[1], 1)

    # (batch, locations, units)
    energy = tf.nn.tanh(self.w1(features) + self.w2(query))

    # One weight per location, normalised over the location axis.
    attention_weight = tf.nn.softmax(self.v(energy), axis=1)

    # Attention-weighted sum of the features over the location axis.
    context_vector = tf.reduce_sum(attention_weight * features, axis=1)
    return context_vector, attention_weight


In [0]:
class CNN_Encoder(tf.keras.Model):
  """Projects the cached image features to `embedding_dim` with Dense + ReLU."""

  def __init__(self, embedding_dim):
    super().__init__()
    self.fc = tf.keras.layers.Dense(embedding_dim)

  def call(self, inputs):
    # Dense acts on the last axis; ReLU is applied to the projected features.
    return tf.nn.relu(self.fc(inputs))

In [0]:
class RNN_Decoder(tf.keras.Model):
  """Attention-based GRU decoder that scores the next caption token.

  `call` expects `inputs = [x, features, hidden]`:
    x: token ids, (batch, 1) as used by `train_step` and `evaluate`.
    features: encoded image features, (batch, locations, feature_dim).
    hidden: previous decoder state, (batch, units).

  Returns `(logits, state, attention_weight)`; logits has one row of
  `vocab_size` scores per (batch, step) position.
  """

  def __init__(self, embedding_dim, units, vocab_size):
    super().__init__()
    self.units = units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
    self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = Attention(self.units)

  def call(self, inputs):
    token_ids, features, hidden = inputs[0], inputs[1], inputs[2]

    # Attend over the image features using the previous hidden state.
    context_vector, attention_weight = self.attention([features, hidden])

    # (batch, steps, embedding_dim)
    embedded = self.embedding(token_ids)

    # Prepend the context vector to the embedding along the feature axis.
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    # NOTE(review): the GRU is called without `initial_state`, so `hidden` only
    # feeds the attention layer — confirm this is intended.
    output, state = self.gru(gru_input)

    # (batch, steps, units)
    projected = self.fc1(output)

    # (batch * steps, units)
    flattened = tf.reshape(projected, (-1, projected.shape[2]))

    # (batch * steps, vocab_size)
    logits = self.fc2(flattened)

    return logits, state, attention_weight

  def reset_state(self, batch_size):
    """Return an all-zero hidden state of shape (batch_size, units)."""
    return tf.zeros((batch_size, self.units))

In [0]:
# Build the image-feature encoder and the caption decoder.
encoder = CNN_Encoder(embedding_dim=embedding_dim)
decoder = RNN_Decoder(embedding_dim=embedding_dim, units=units, vocab_size=vocab_size)

In [0]:
optimizer = tf.keras.optimizers.Adam()

# reduction='none' keeps one loss value per example so padding can be masked below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy over `pred` logits, with positions where `real == 0` zeroed out.

  The mean is taken over all positions, masked ones included.
  """
  per_position_loss = loss_object(real, pred)
  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(is_real_token, dtype=per_position_loss.dtype)
  return tf.reduce_mean(per_position_loss * weights)

In [0]:
# Directory the checkpoint manager writes to.
checkpoint_path = './checkpoints/train'

# Track the encoder, decoder and optimizer together in one checkpoint object.
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)

# Keep at most the 5 most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [0]:
# Resume from the latest checkpoint when one exists.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  # The checkpoint path ends in '-<number>'; that number is used as the start epoch.
  # NOTE(review): the training loop saves only when `epoch % 5 == 0`, so this
  # number presumably counts saves rather than epochs — confirm the mapping.
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  # Was `ckpt.restor(...)`, which raises AttributeError.
  ckpt.restore(ckpt_manager.latest_checkpoint)

In [0]:
# Per-epoch average loss, appended by the training loop.
loss_plot = []

@tf.function
def train_step(img_tensor, target):
  """Run one optimisation step on a batch.

  Args:
    img_tensor: batch of cached image features fed to the encoder.
    target: batch of padded caption token ids, indexed as (batch, time).

  Returns:
    (loss, total_loss): the loss summed over time steps, and that sum divided
    by the caption length `target.shape[1]`.
  """
  loss = 0

  # Fresh all-zero hidden state for every batch.
  hidden = decoder.reset_state(batch_size = target.shape[0])

  # Every caption starts with the '<start>' token: shape (batch, 1).
  dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)

  with tf.GradientTape() as tape:
    features = encoder(img_tensor)

    # Position 0 is the '<start>' input, so prediction targets begin at position 1.
    for i in range(1, target.shape[1]):
      predictions, hidden, _ = decoder([dec_input, features, hidden])

      loss = loss + loss_function(target[:, i], predictions)

      # Teacher forcing: the ground-truth token becomes the next decoder input.
      dec_input =  tf.expand_dims(target[:, i], 1)

  total_loss = (loss / int(target.shape[1]))

  # Gradients are taken of the summed loss, not of the length-normalised one.
  trainable_variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, trainable_variables)
  optimizer.apply_gradients(zip(gradients, trainable_variables))
  return loss, total_loss

In [0]:
EPOCHS = 40

# Train from `start_epoch` (non-zero when resuming from a checkpoint) up to EPOCHS.
for epoch in range(start_epoch, EPOCHS):
  start = time.time()
  total_loss = 0

  for (batch, (img_tensor, target)) in enumerate(dataset):
    batch_loss, t_loss = train_step(img_tensor, target)
    total_loss += t_loss

    # Report the length-normalised batch loss every 100 batches.
    if batch % 100 == 0:
      print ('Epoch {} Batch {} Loss {}'.format(epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

  # Average per-batch loss of this epoch, kept for plotting.
  loss_plot.append(total_loss / num_steps)

  # Save a checkpoint on every 5th epoch (epochs 0, 5, 10, ...).
  if epoch % 5 == 0:
    ckpt_manager.save()

  print('Epoch {} Loss {}'.format(epoch+1, total_loss/num_steps))

  # Fixed: the original string ended in a literal '/n' instead of a newline.
  print('Time Spend for 1 Epoch {}sec\n'.format(time.time() - start))
    

Epoch 1 Batch 0 Loss 2.0066852180325254
Epoch 1 Batch 100 Loss 1.094290830651108
Epoch 1 Batch 200 Loss 0.9839502451371174
Epoch 1 Batch 300 Loss 0.8716721826670121
Epoch 1 Loss 1.0264102220535278
Time Spend for 1 Epoch 393.9214856624603sec/n
Epoch 2 Batch 0 Loss 0.8252892007633131
Epoch 2 Batch 100 Loss 0.7602089862434231
Epoch 2 Batch 200 Loss 0.8157076154436383
Epoch 2 Batch 300 Loss 0.7445244302555006
Epoch 2 Loss 0.7885914444923401
Time Spend for 1 Epoch 369.1431996822357sec/n
Epoch 3 Batch 0 Loss 0.7935508416623486
Epoch 3 Batch 100 Loss 0.7205210315937899
Epoch 3 Batch 200 Loss 0.7463680967992666
Epoch 3 Batch 300 Loss 0.7163982002102599
Epoch 3 Loss 0.7204239368438721
Time Spend for 1 Epoch 367.4744780063629sec/n
Epoch 4 Batch 0 Loss 0.650492999018455
Epoch 4 Batch 100 Loss 0.7417800280512595
Epoch 4 Batch 200 Loss 0.6722691594337931
Epoch 4 Batch 300 Loss 0.6488368754484215
Epoch 4 Loss 0.6761036515235901
Time Spend for 1 Epoch 364.7594618797302sec/n
Epoch 5 Batch 0 Loss 0.654

In [0]:
def evaluate(image):
  """Generate a caption for the image stored at path `image`.

  Tokens are sampled one at a time until '<end>' is produced or `max_length`
  tokens have been generated.

  Returns:
    (result, attention_plot): `result` is the list of generated words
    (including '<end>' when it was produced); `attention_plot` has one row of
    attention weights per generated word.
  """
  attention_plot = np.zeros((max_length, attention_features_shape))

  hidden = decoder.reset_state(batch_size=1)

  temp_input = tf.expand_dims(load_image(image)[0], 0)  # (1, 299, 299, 3)
  img_tensor_val = image_features_extract_model(temp_input)
  # Collapse the two spatial axes into one: (1, H, W, C) -> (1, H*W, C).
  img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

  features = encoder(img_tensor_val)

  dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)  # (1, 1)

  result = []

  for i in range(max_length):
    # The decoder takes ONE list argument [tokens, features, hidden]; the
    # original passed three positional arguments.
    predictions, hidden, attention_weight = decoder([dec_input, features, hidden])
    # Fixed typo: was `attetion_plot`.
    attention_plot[i] = tf.reshape(attention_weight, (-1, )).numpy()

    # Sample the next token id from the predicted logits.
    predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
    predicted_word = tokenizer.index_word[predicted_id]
    # Fixed typo: was `result.appedn`.
    result.append(predicted_word)

    if predicted_word == '<end>':
      break

    # Feed the sampled token back in as the next decoder input.
    dec_input = tf.expand_dims([predicted_id], 0)

  # Keep only the rows that belong to generated words. This now also applies
  # when '<end>' stops generation early (the original returned all rows there).
  attention_plot = attention_plot[:len(result), :]
  return result, attention_plot

In [0]:
def plot_attention(image, result, attention_plot):
  """Draw one subplot per generated word with its attention map over the image.

  Args:
    image: path of the image file (opened with PIL).
    result: list of generated words; each becomes a subplot title.
    attention_plot: array whose row `i` holds the attention weights for
      `result[i]`; each row is resized to an 8x8 grid before plotting.
  """
  temp_image = np.array(Image.open(image))

  fig = plt.figure(figsize=(10, 10))

  len_result = len(result)
  # Square grid with ceil(n / 2) cells per side, at least 2. The original
  # `len_result // 2` grid had too few cells for 1, 2, 3 or 5 words.
  grid_size = max((len_result + 1) // 2, 2)

  for idx in range(len_result):
    temp_att = np.resize(attention_plot[idx], (8, 8))
    ax = fig.add_subplot(grid_size, grid_size, idx + 1)
    ax.set_title(result[idx])
    img = ax.imshow(temp_image)
    # Overlay the attention map, stretched to the image's extent.
    ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())
  plt.tight_layout()
  # Fixed: the original wrote `plt.show` without calling it.
  plt.show()

In [0]:
# Caption a random image from the validation split and compare with its reference caption.

rid = np.random.randint(0, len(img_name_val))

image = img_name_val[rid]

# Convert the reference token ids back to words, skipping padding (id 0).
real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])

# Fixed: the result was bound to the misspelled name `attetion_plot`, while
# `attention_plot` is the name used below.
result, attention_plot = evaluate(image)

# Fixed: was `real_catption`, which raises NameError.
print('Real Caption', real_caption)

print('Predicted Caption', ' '.join(result))
plot_attention(image, result, attention_plot)