# Image captioning with visual attention
This notebook is a copy of the TensorFlow image captioning tutorial:
https://www.tensorflow.org/tutorials/text/image_captioning

The model architecture is similar to https://arxiv.org/abs/1502.03044

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle

__Enable Eager Execution__

In [None]:
# Eager execution is the default from TensorFlow 2.0 on, where the top-level
# tf.enable_eager_execution() no longer exists. Only switch it on when it is
# not already active, and go through the compat.v1 alias, which is available
# in late 1.x releases as well as in 2.x.
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

## Download and prepare the MS-COCO dataset
The dataset contains over 82,000 images, each of which has at least 5 different caption annotations. The code below downloads and extracts the dataset automatically.

### When using AWS SageMaker
I use AWS SageMaker to run this notebook; there, Keras has to be installed first with the command below.

In [None]:
# Install Keras into the notebook environment (needed on the SageMaker kernel).
# NOTE(review): presumably required by the local `mscoco` helper imported in the
# next cell — confirm, and consider pinning a version for reproducibility.
!pip install keras

In [2]:
# `mscoco` is a project-local helper module, not a published package.
# NOTE(review): importing it prints the Keras backend banner, so it presumably
# imports Keras and therefore has to stay below the `pip install keras` cell —
# confirm before moving this import to the top import cell.
from mscoco import MsCoco

# Helper object used below to obtain the dataset via `dataset.download()`.
dataset = MsCoco()

Using TensorFlow backend.


In [4]:
# Fetch the dataset through the project-local MsCoco helper. The two returned
# values are used below as the path of the caption-annotation JSON file and as
# the string prefix of every training-image path; because the prefix is
# concatenated directly, `train_folder` has to end with a path separator.
# NOTE(review): presumably skips the download when the files already exist —
# confirm in mscoco.py.
annotation_file, train_folder = dataset.download()

## Get images vector and annotations

In [6]:
# Parse the caption annotations into memory. JSON is UTF-8 by specification,
# so the encoding is stated explicitly instead of relying on the platform
# default (which is not UTF-8 on every system).
with open(annotation_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

In [7]:
# Parallel accumulators filled by the annotation loop below: entry i of one
# list belongs to entry i of the other (caption text / path of its image).
all_captions, all_img_name_vector = [], []

In [9]:
# Build one (caption, image path) pair per annotation record.
for annotation in annotations['annotations']:
    # The sentinel tokens must be separated from the caption text by spaces;
    # otherwise a whitespace-based tokenizer fuses them with the first and
    # last words (e.g. '<start>A' / 'bike.<end>') and '<start>' / '<end>'
    # never become vocabulary entries of their own.
    caption = f'<start> {annotation["caption"]} <end>'
    image_id = annotation['image_id']
    # Image file names carry the id zero-padded to 12 digits. train_folder is
    # used as a plain string prefix, so it must end with a path separator.
    coco_image_path = train_folder + 'COCO_train2014_' + '%012d.jpg' % (image_id)

    all_img_name_vector.append(coco_image_path)
    all_captions.append(caption)

In [10]:
# Shuffle captions and image paths together so that each caption stays aligned
# with its image. A fixed random_state makes the resulting order (and anything
# derived from it later) reproducible across kernel restarts.
train_captions, img_name_vector = shuffle(all_captions,
                                          all_img_name_vector,
                                          random_state=1)

In [11]:
# Sanity check: shuffling must neither drop nor duplicate entries, so the two
# counts are expected to be equal.
tuple(map(len, (train_captions, all_captions)))

(414113, 414113)

## Preprocess the images using InceptionV3

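First, you will convert the images into InceptionV3's expected format by:

* Resizing the image to 299px by 299px
* Preprocessing the image with InceptionV3's `preprocess_input`, which normalizes the pixel values to the range -1 to 1 that InceptionV3 was trained on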

In [12]:
def load_image(image_path):
    """Read a JPEG file and prepare it as input for InceptionV3.

    The file is decoded to three channels, resized to 299x299 and passed
    through InceptionV3's ``preprocess_input``.

    Args:
        image_path: Path of the JPEG file to load.

    Returns:
        A ``(image, image_path)`` tuple. The path is handed back unchanged so
        that callers can tell which file each image came from.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

## Initialize InceptionV3 and load the pretrained Imagenet weights

In [None]:
# InceptionV3 with ImageNet weights; include_top=False leaves out the
# classification head so the network ends in its last convolutional block.
image_model = tf.keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
)

# Output and input tensors used to assemble the feature-extraction model.
hidden_layers = image_model.layers[-1].output
new_image = image_model.input

In [14]:
# Feature extractor: maps an image batch to the output of InceptionV3's last layer.
image_features_extract_model = tf.keras.Model(inputs=new_image,
                                              outputs=hidden_layers)

## Caching the features extracted from InceptionV3

In [16]:
# Every image has several captions, so its path occurs several times in
# img_name_vector; keep each path once, in sorted order.
encode_train = list(set(img_name_vector))
encode_train.sort()

In [17]:
# Input pipeline over the unique image paths: load and preprocess images in
# parallel, then group them into batches of 32.
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encode_train)
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(32)
)

In [None]:
# Run every batch through the feature extractor and store each image's
# features on disk next to the image itself.
for image, path in image_dataset:
    batch_features = image_features_extract_model(image)
    # Merge the two middle (spatial) axes of the 4-D output into a single axis,
    # keeping the batch axis and the last (channel) axis as they are.
    flattened_shape = (batch_features.shape[0], -1, batch_features.shape[3])
    batch_features = tf.reshape(batch_features, flattened_shape)

    for bf, p in zip(batch_features, path):
        # Paths arrive as byte-string tensors. np.save appends '.npy' because
        # the image path does not end with it, so the features land in
        # '<image path>.npy'.
        path_of_feature = p.numpy().decode("utf-8")
        np.save(path_of_feature, bf.numpy())