https://medium.com/analytics-vidhya/neural-machine-translation-using-bahdanau-attention-mechanism-d496c9be30c3

In [None]:
# Mount Google Drive (Colab only); every data and model path used below lives under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from numpy import array
from pickle import load
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

## File Loading Helper Functions
Helps load previously processed image features and cleaned image descriptions.

In [None]:
# load doc into memory
def load_doc(filename):
  """Read a whole text file into memory.

  Args:
    filename: path of the text file to read.

  Returns:
    The complete contents of the file as a single string.
  """
  # the context manager closes the file even if read() raises
  with open(filename, 'r') as file:
    return file.read()

# load list of unique photo ids, derived from image file names
def load_set(filename):
  """Return the set of image identifiers listed in a file.

  The file is read line by line; blank lines are ignored and, for every other
  line, the text before the first '.' is taken as the identifier.

  Args:
    filename: path of the file listing one image file name per line.

  Returns:
    A set of identifier strings.
  """
  lines = load_doc(filename).split('\n')
  # keep non-empty lines only, stripped of everything from the first '.' on
  return {line.split('.')[0] for line in lines if len(line) >= 1}

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
  """Load the cleaned captions for the images in `dataset`.

  Every line of the file is split on whitespace: the first token is the image
  identifier and the remaining tokens are the caption words. Captions are
  wrapped in the 'startseq' / 'endseq' markers.

  Args:
    filename: path of the descriptions file.
    dataset: collection of image identifiers to keep (membership is tested
      with `in`).

  Returns:
    A dict mapping image identifier -> list of wrapped caption strings.
  """
  descriptions = dict()
  for line in load_doc(filename).split('\n'):
    # split line by white space
    tokens = line.split()
    # blank lines (such as the one a trailing newline produces) have no tokens;
    # skip them instead of failing on tokens[0]
    if not tokens:
      continue
    # split id from description
    image_id, image_desc = tokens[0], tokens[1:]
    # skip images not in the set
    if image_id not in dataset:
      continue
    # wrap description in tokens and store it under its image id
    desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
    descriptions.setdefault(image_id, list()).append(desc)
  return descriptions

# load photo features
def load_photo_features(filename, dataset):
  """Load pickled photo features and keep only the entries for `dataset`.

  Args:
    filename: path of the pickle file holding a mapping of image identifier
      to features.
    dataset: iterable of image identifiers to keep.

  Returns:
    A dict mapping each identifier in `dataset` to its stored features.

  Raises:
    KeyError: if an identifier in `dataset` is missing from the pickle.
  """
  # NOTE: unpickling can execute arbitrary code; only load files you trust.
  # The context manager makes sure the file handle is closed after loading.
  with open(filename, 'rb') as handle:
    all_features = load(handle)
  # filter features
  return {k: all_features[k] for k in dataset}

## Tokenizer and Tokenizer helper functions
The tokenizer encodes English captions as sequences of integer word indices;
these sequences are later padded to a uniform length. The tokenizer is fitted on the training descriptions text.

In [None]:
# Build tokenizer
# Note: add a limit to the vocabulary? Reduce from ~9500 to 5000 words

# convert a dictionary of clean descriptions (image_id: list of descriptions) to a general list of all descriptions
def to_lines(descriptions):
  """Flatten a descriptions dict into one list of captions.

  Args:
    descriptions: dict mapping image identifier -> list of caption strings.

  Returns:
    A list with every caption, in dict iteration order.
  """
  # build the list directly rather than using a comprehension for its side effects
  return [desc for desc_list in descriptions.values() for desc in desc_list]
 
#vocab_limit = 5000
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
  """Fit a Keras Tokenizer on every caption in `descriptions`.

  Args:
    descriptions: dict mapping image identifier -> list of caption strings.

  Returns:
    The fitted Tokenizer. No `num_words` cap is applied.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(to_lines(descriptions))
  return fitted

# Helper function; calculate the length of the description with the most words
def calc_max_length(description):
  """Return the number of words in the longest caption.

  Args:
    description: dict mapping image identifier -> list of caption strings.

  Returns:
    The largest whitespace-separated word count over all captions.
  """
  word_counts = [len(caption.split()) for caption in to_lines(description)]
  return max(word_counts)

## Creating sequences

Takes the tokenizer, the maximum description length, the dictionary of all descriptions, the dictionary of photo features and the vocabulary size, and transforms the data into input/output pairs for training the model.

In [None]:
# create sequences of images, input sequences, and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos_features, vocab_size):
  """Expand every caption into (photo, caption prefix) -> next word samples.

  For a caption encoded as n integers, n - 1 samples are produced: the first
  i integers (padded to `max_length`) are the text input and the integer at
  position i, one-hot encoded over `vocab_size` classes, is the target.

  Args:
    tokenizer: fitted tokenizer used to encode captions as integers.
    max_length: length every text input is padded to.
    descriptions: dict mapping image identifier -> list of caption strings.
    photos_features: dict mapping image identifier -> features; element [0]
      of each entry is used as the photo input.
    vocab_size: number of classes for the one-hot targets.

  Returns:
    Three arrays: photo inputs, padded text inputs, one-hot targets.
  """
  image_inputs, text_inputs, targets = [], [], []
  for image_id, captions in descriptions.items():
    for caption in captions:
      # integer-encode the caption
      encoded = tokenizer.texts_to_sequences([caption])[0]
      # every proper prefix predicts the word that follows it
      for split_at in range(1, len(encoded)):
        prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
        next_word = to_categorical([encoded[split_at]], num_classes=vocab_size)[0]
        image_inputs.append(photos_features[image_id][0])
        text_inputs.append(prefix)
        targets.append(next_word)
  return array(image_inputs), array(text_inputs), array(targets)

In [None]:
import tensorflow as tf

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention that reduces a sequence to one context vector.

    Args:
        units: width of the two Dense projections applied to the query and to
            the values before they are summed and scored.
        **kwargs: standard Keras layer arguments (for example `name`),
            forwarded to the base Layer.
    """

    def __init__(self, units, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        # kept so get_config() can report the constructor argument
        self.units = units
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values): # hidden, enc_output
        """Return the attention-weighted sum of `values` along axis 1.

        # assumes query is (batch, hidden) and values is (batch, time, hidden),
        # which is how define_model calls this layer — TODO confirm
        """
        # add an axis at position 1 so the query broadcasts against values
        query_with_time_axis = tf.expand_dims(query, 1)
        # one scalar score per position along axis 1
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        # normalise the scores over axis 1
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        # only the context vector is returned; the weights are not exposed
        return context_vector

    def get_config(self):
        """Return the layer config including `units`, so the layer can be re-created from it."""
        config = super(BahdanauAttention, self).get_config()
        config.update({'units': self.units})
        return config

## Building model


In [None]:
# Defining the captioning model with Bahdanau Attention

import tensorflow as tf

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention that reduces a sequence to one context vector.

    Args:
        units: width of the two Dense projections applied to the query and to
            the values before they are summed and scored.
        **kwargs: standard Keras layer arguments (for example `name`),
            forwarded to the base Layer.
    """

    def __init__(self, units, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        # kept so get_config() can report the constructor argument
        self.units = units
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values): # hidden, enc_output
        """Return the attention-weighted sum of `values` along axis 1.

        # assumes query is (batch, hidden) and values is (batch, time, hidden),
        # which is how define_model calls this layer — TODO confirm
        """
        # add an axis at position 1 so the query broadcasts against values
        query_with_time_axis = tf.expand_dims(query, 1)
        # one scalar score per position along axis 1
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        # normalise the scores over axis 1
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        # only the context vector is returned; the weights are not exposed
        return context_vector

    def get_config(self):
        """Return the layer config including `units`, so the layer can be re-created from it."""
        config = super(BahdanauAttention, self).get_config()
        config.update({'units': self.units})
        return config

def define_model(vocab_size, max_length):
  """Build and compile the captioning model that uses BahdanauAttention.

  The model has two inputs, a 2048-value photo feature vector and a caption
  prefix of `max_length` word indices, and one softmax output over
  `vocab_size` words.

  Args:
    vocab_size: size of the Embedding input and of the softmax output.
    max_length: length of the (padded) caption prefix input.

  Returns:
    The compiled Keras Model (categorical cross-entropy loss, adam optimizer).
  """
  # feature extractor model
  inputs1 = Input(shape=(2048,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activation='relu')(fe1)
  # sequence model
  inputs2 = Input(shape=(max_length,))
  se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  rnn = LSTM(256, return_sequences=True, return_state=True)
  # with return_state=True the LSTM returns three tensors (per-step outputs,
  # final hidden state, final cell state); unpacking into two names raises
  enc_output, hidden_state, _cell_state = rnn(se2)
  attention = BahdanauAttention(2048)
  # the final hidden state is the query, the per-step outputs are the values
  se3 = attention(hidden_state, enc_output)
  # decoder model
  decoder1 = add([fe2, se3])
  decoder2 = Dense(256, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)
  # tie it together [image, seq] [word]
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  # summary() prints the table itself and returns None, so it is not wrapped in print()
  model.summary()
  #plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [None]:
# https://github.com/keras-team/keras/issues/4962

from keras.layers import Flatten, Activation, RepeatVector, Permute

# Defining the captioning model
def define_model(vocab_size, max_length):
  """Build and compile the captioning model built from stock Keras layers.

  The model has two inputs, a 2048-value photo feature vector and a caption
  prefix of `max_length` word indices, and one softmax output over
  `vocab_size` words. A diagram of the model is written to 'model.png'.

  Args:
    vocab_size: size of the Embedding input and of the softmax output.
    max_length: length of the (padded) caption prefix input.

  Returns:
    The compiled Keras Model (categorical cross-entropy loss, adam optimizer).
  """
  # feature extractor model
  inputs1 = Input(shape=(2048,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activation='relu')(fe1)
  # sequence model
  inputs2 = Input(shape=(max_length,))
  se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  activations = LSTM(256, return_sequences=True)(se2)
  # one tanh score per time step, softmax-normalised across the sequence
  attention = Dense(1, activation='tanh')(activations)
  attention = Flatten()(attention)
  attention = Activation('softmax')(attention)
  # repeat the weights 256 times, then swap axes so time comes first again
  attention = RepeatVector(256)(attention)
  attention = Permute([2, 1])(attention)
  # NOTE(review): `activations` is not used again after the scores are
  # computed; the second LSTM consumes the repeated weights themselves, not a
  # weighted sum of the first LSTM's outputs. Confirm this is intended.
  se3 = LSTM(256)(attention)
  # decoder model
  decoder1 = add([fe2, se3])
  decoder2 = Dense(256, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)
  # tie it together [image, seq] [word]
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  # summary() prints the table itself and returns None, so it is not wrapped in print()
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [None]:
# NOTE: vocab_size and max_length are assigned in the "Load training data" cell, so that cell has to run before this one
model = define_model(vocab_size, max_length)

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           [(None, 34)]         0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 34, 256)      1940224     input_28[0][0]                   
__________________________________________________________________________________________________
dropout_27 (Dropout)            (None, 34, 256)      0           embedding_13[0][0]               
__________________________________________________________________________________________________
lstm_13 (LSTM)                  (None, 34, 256)      525312      dropout_27[0][0]                 
_______________________________________________________________________________________

## Load training data 
Creates input/output pairs for the training data <br>
input: image features, text descriptions <br>
output: next word

In [None]:
# load training dataset: the set of image identifiers listed in the training split file
filename = '/content/drive/My Drive/ImageCaptioningProject/Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
# descriptions: captions of the training images only, each wrapped in startseq/endseq
train_descriptions = load_clean_descriptions('/content/drive/My Drive/ImageCaptioningProject/descriptions.txt', train)
# photo features: pickled features restricted to the training image identifiers
train_features = load_photo_features('/content/drive/My Drive/ImageCaptioningProject/features.pkl', train)
# prepare tokenizer: fitted on the training captions only
tokenizer = create_tokenizer(train_descriptions)

# Embedding layer expects input_dim to be vocab size + 1
# (the extra slot leaves room for index 0, which the padded positions use)
vocab_size = len(tokenizer.word_index) + 1
# determine the maximum sequence length (word count of the longest training caption)
max_length = calc_max_length(train_descriptions)
# prepare sequences: photo inputs, padded caption prefixes, one-hot next-word targets
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)


## Load validation data
Creates input/output pairs for the validation data

In [None]:
# load validation set: the set of image identifiers listed in the dev split file
filename = '/content/drive/My Drive/ImageCaptioningProject/Flickr8k_text/Flickr_8k.devImages.txt'
validation = load_set(filename)
# descriptions: captions of the validation images only, each wrapped in startseq/endseq
validation_descriptions = load_clean_descriptions('/content/drive/My Drive/ImageCaptioningProject/descriptions.txt', validation)
# photo features: pickled features restricted to the validation image identifiers
validation_features = load_photo_features('/content/drive/My Drive/ImageCaptioningProject/features.pkl', validation)

# prepare sequences, reusing the tokenizer, max_length and vocab_size derived from the training data
X1val, X2val, yval = create_sequences(tokenizer, max_length, validation_descriptions, validation_features, vocab_size)

## Train model
The model is saved after every epoch in which the validation loss improves.

In [None]:
# Fit model

# define the model (uses whichever define_model cell was run most recently)
model = define_model(vocab_size, max_length)

# Adding checkpoint - save the model when it improves, 
# and then use the model with the best skill as the final model.
# https://www.tensorflow.org/tutorials/keras/save_and_load
# SavedModel_format rather than .h5, since saving a custom model

# Define checkpoint callback
# The path is a template: epoch number, training loss and validation loss are filled in per checkpoint.
# NOTE(review): this path uses 'MyDrive' while the data-loading cells use 'My Drive' - confirm both resolve to the same Drive folder
filepath = '/content/drive/MyDrive/ImageCaptioningProject/NewModels/TutorialModels/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}'
# save_best_only with monitor='val_loss' and mode='min': a checkpoint is written only when validation loss reaches a new minimum
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# fit model for 5 epochs, validating on the dev-split sequences after each epoch
model.fit([X1train, X2train], ytrain, epochs=5, verbose=1, callbacks=[checkpoint], validation_data=([X1val, X2val], yval))

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 34)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 34, 256)      1940224     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 34, 256)      0           embedding[0][0]                  
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 34, 256)      525312      dropout_1[0][0]                  
_______________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7feed11384a8>

## Evaluate model

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# map an integer to word
def word_for_id(integer, tokenizer):
  """Map a word index back to its word.

  Args:
    integer: the word index to look up.
    tokenizer: object whose `word_index` attribute maps word -> index.

  Returns:
    The word whose index equals `integer`, or None if no word has that index.
  """
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  # must be the None singleton: the lowercase name `none` is undefined and
  # raised NameError whenever an index had no matching word
  return None

# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
  """Generate a caption for one photo, one word at a time.

  Starting from 'startseq', the current caption is encoded, padded to
  `max_length` and passed to the model together with the photo; the index
  with the highest predicted value is mapped back to a word and appended.
  Generation stops after `max_length` words, when the index has no word, or
  once 'endseq' has been appended.

  Args:
    model: model whose predict() takes [photo, padded sequence].
    tokenizer: fitted tokenizer used for encoding and for the reverse lookup.
    photo: photo features passed to the model unchanged.
    max_length: padding length and maximum number of generated words.

  Returns:
    The caption string, beginning with 'startseq'.
  """
  caption_words = ['startseq']
  for _ in range(max_length):
    # encode and pad everything generated so far
    encoded = tokenizer.texts_to_sequences([' '.join(caption_words)])[0]
    padded = pad_sequences([encoded], maxlen=max_length)
    # pick the most probable next word index and turn it back into a word
    probabilities = model.predict([photo, padded], verbose=0)
    next_word = word_for_id(argmax(probabilities), tokenizer)
    if next_word is None:
      # the index has no word: stop generating
      break
    caption_words.append(next_word)
    if next_word == 'endseq':
      break
  return ' '.join(caption_words)

# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
  """Generate a caption for every image and print corpus BLEU-1 to BLEU-4.

  Args:
    model: captioning model passed to generate_desc.
    descriptions: dict mapping image identifier -> list of reference captions.
    photos: dict mapping image identifier -> photo features.
    tokenizer: fitted tokenizer passed to generate_desc.
    max_length: maximum caption length passed to generate_desc.

  Returns:
    None; the four scores are printed.
  """
  actual, predicted = list(), list()
  # step over the whole set
  for key, desc_list in descriptions.items():
    # generate description
    yhat = generate_desc(model, tokenizer, photos[key], max_length)
    # store the tokenised references and the tokenised prediction
    references = [d.split() for d in desc_list]
    actual.append(references)
    predicted.append(yhat.split())
  # calculate BLEU scores; each weight tuple spreads a total of 1 uniformly
  # over the n-gram orders in use (BLEU-3 previously used 0.3 x 3 = 0.9)
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
from keras.models import load_model
from numpy import argmax

# load tokenizer (closing the pickle file once it has been read)
# NOTE: unpickling can execute arbitrary code; only load files you trust
with open('/content/drive/My Drive/ImageCaptioningProject/develop_tokenizer.pkl', 'rb') as tokenizer_file:
  tokenizer = load(tokenizer_file)
# previously defined
# NOTE(review): hard-coded; presumably the max_length computed from the training captions - confirm it matches the loaded model
max_length = 34

# load test set, unseen by the model
filename = '/content/drive/My Drive/ImageCaptioningProject/Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
# descriptions
test_descriptions = load_clean_descriptions('/content/drive/My Drive/ImageCaptioningProject/descriptions.txt', test)
# photo features
test_features = load_photo_features('/content/drive/My Drive/ImageCaptioningProject/features.pkl', test)

# load the model
filename = '/content/drive/MyDrive/ImageCaptioningProject/NewModels/TransformerModelsWithLR/monday_model'
saved_model = load_model(filename)

# evaluate model: prints BLEU-1 to BLEU-4 over the test set
evaluate_model(saved_model, test_descriptions, test_features, tokenizer, max_length)

In [None]:
from tensorflow.keras.applications import InceptionV3
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.inception_v3 import preprocess_input

# extract features of new uncaptioned photos
def extract_features(filename):
  """Compute InceptionV3 features for a single image file.

  A fresh InceptionV3 is built on every call and cut off at its
  second-to-last layer, so the output of that layer is what gets returned.

  Args:
    filename: path of the image to process.

  Returns:
    The prediction of the truncated network for a batch holding this one image.
  """
  # InceptionV3 without its unnecessary final classification layer
  base = InceptionV3()
  encoder = Model(inputs=base.inputs, outputs=base.layers[-2].output)
  # load the photo at 299x299 and convert its pixels to a numpy array
  pixels = img_to_array(load_img(filename, target_size=(299, 299)))
  # add a leading batch axis of size 1
  batch = pixels.reshape((1,) + pixels.shape)
  # prepare image for CNN. Normalizes image array to range [-1, 1], matching format of images used to train InceptionV3
  batch = preprocess_input(batch)
  # get image features
  return encoder.predict(batch, verbose=0)

In [None]:
from keras.models import load_model
from numpy import argmax
# BEST MODEL SO FAR: /content/drive/MyDrive/ImageCaptioningProject/NewModels/TutorialModels/model-ep004-loss3.884-val_loss4.150
# load the checkpoint written by the training cell for epoch 4
saved_model = load_model('/content/drive/MyDrive/ImageCaptioningProject/NewModels/TutorialModels/model-ep004-loss3.884-val_loss4.150')

# extract InceptionV3 features for a new, uncaptioned image
photo = extract_features('/content/drive/MyDrive/ImageCaptioningProject/uncaptioned_images/example4.jpg')
# tokenizer and max_length are the globals set by an earlier cell; they must already be defined
description = generate_desc(saved_model, tokenizer, photo, max_length)
print(description)

startseq young boy in blue shirt is playing in the water endseq
