In [1]:
# Mount Google Drive at /content/drive; the data, helper module and model
# output paths used throughout this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
from numpy import array
from pickle import load, dump
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

In [3]:
import sys
import os

# Helper functions to load previously processed image features, cleaned image descriptions, and other saved items.
# The folder below is appended to sys.path so that the project-local module can be imported by name.
py_file_location = '/content/drive/MyDrive/ImageCaptioningProject/Notebooks/FinalizedNotebooks'
sys.path.append(os.path.abspath(py_file_location))

# HelperFunctions is expected to live in py_file_location, so this import must come after the sys.path update.
import HelperFunctions as helper

## GloVe embeddings

In [4]:
def getDicts(descriptions, word_count_threshold=None):
  """Build the vocabulary and the word <-> index lookup tables.

  Args:
    descriptions: image descriptions, in whatever form ``helper.toLines``
      accepts; it is expected to yield one caption string per item.
    word_count_threshold: minimum number of occurrences a word needs in order
      to be kept. ``None`` (the default) keeps every word.

  Returns:
    A tuple ``(vocab_size, wordtoix, ixtoword)``. Word indices start at 1 so
    that index 0 stays free for padding; ``vocab_size`` is therefore the
    number of kept words plus one.
  """
  # Comparing an int count with None raises TypeError in Python 3, so the
  # default is normalised to "no threshold" before filtering.
  min_count = 0 if word_count_threshold is None else word_count_threshold

  word_counts = {}
  for sent in helper.toLines(descriptions):
    for w in sent.split(' '):
      word_counts[w] = word_counts.get(w, 0) + 1

  # Dict iteration keeps first-seen order, so indices are assigned deterministically.
  vocab = [w for w, count in word_counts.items() if count >= min_count]

  # Two dictionaries: one maps word -> index, the other index -> word.
  wordtoix = {w: ix for ix, w in enumerate(vocab, start=1)}
  ixtoword = {ix: w for w, ix in wordtoix.items()}

  # +1 accounts for the padding index 0, which is not assigned to any word.
  vocab_size = len(ixtoword) + 1
  return vocab_size, wordtoix, ixtoword

def getGloveEmbeddings(wordtoix, vocab_size,
                       glove_path='/content/drive/MyDrive/ImageCaptioningProject/glove.6B.200d.txt',
                       embedding_dim=200):
  """Build an embedding matrix for the vocabulary from a GloVe text file.

  Args:
    wordtoix: dict mapping each vocabulary word to its row index.
    vocab_size: number of rows of the returned matrix.
    glove_path: path of the GloVe file; each non-empty line is a word followed
      by its whitespace-separated coefficients. Defaults to the project's
      200-d file.
    embedding_dim: number of coefficients per word in ``glove_path``.

  Returns:
    A ``(vocab_size, embedding_dim)`` numpy array. Rows of words that do not
    appear in the GloVe file (and row 0, which no word maps to when indices
    start at 1) are left as zeros.
  """
  embeddings_index = {}
  # The context manager closes the file even if a malformed line raises.
  with open(glove_path, encoding='utf-8') as glove_file:
    for line in glove_file:
      values = line.split()
      if not values:
        # Blank line: nothing to parse.
        continue
      word = values[0]
      # Only vectors for vocabulary words are ever read back below, so the
      # rest of the (large) GloVe file is not converted or kept in memory.
      if word in wordtoix:
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

  # Matrix of shape (vocab_size, embedding_dim): one row per vocabulary index.
  embedding_matrix = np.zeros((vocab_size, embedding_dim))
  for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
  return embedding_matrix

## Creating sequences

Takes the word-to-index dictionary, the maximum description length, the dictionary of all descriptions, the dictionary of photo features, and the vocabulary size, and transforms the data into input/output pairs for training the model.

In [5]:
# Turn every caption into (image feature, padded caption prefix, one-hot next word) training samples
def createSequences(wordtoix, max_length, descriptions, photos_features, vocab_size):
  """Expand captions into next-word prediction samples.

  Each caption is encoded with ``wordtoix`` (words missing from it are
  dropped). For every position after the first, one sample is produced: the
  image feature, the caption prefix padded to ``max_length``, and the next
  word one-hot encoded over ``vocab_size`` classes.

  Returns:
    Three arrays ``(X1, X2, y)``: image features, padded prefixes, targets.
  """
  image_inputs, text_inputs, targets = [], [], []
  for image_id, captions in descriptions.items():
    for caption in captions:
      # Integer-encode the caption, skipping out-of-vocabulary words
      encoded = [wordtoix[token] for token in caption.split(' ') if token in wordtoix]
      # One sample per prefix/next-word split of the encoded caption
      for split_at, next_word in enumerate(encoded[1:], start=1):
        prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
        target = to_categorical([next_word], num_classes=vocab_size)[0]
        image_inputs.append(photos_features[image_id][0])
        text_inputs.append(prefix)
        targets.append(target)
  return array(image_inputs), array(text_inputs), array(targets)

## Load training data 
Creates input/output pairs for the training data <br>
input: image features, text descriptions <br>
output: next word

In [6]:
descriptions_file = '/content/drive/My Drive/ImageCaptioningProject/descriptions.txt'
features_file = '/content/drive/My Drive/ImageCaptioningProject/features.pkl'

# Load training dataset
train_image_ids_file = '/content/drive/My Drive/ImageCaptioningProject/Flickr8k_text/Flickr_8k.trainImages.txt'
train_ids = helper.loadImageIds(train_image_ids_file)
train_descriptions = helper.loadCleanDescriptions(descriptions_file, train_ids)
train_features = helper.loadImageFeatures(features_file, train_ids)

# Limit to words that have been used at least n times
vocab_size, wordtoix, ixtoword = getDicts(train_descriptions, word_count_threshold=4)
# Save wordtoix and ixtoword to be used to generate new captions.
# `with` guarantees each pickle is flushed and its handle closed before moving on.
with open('/content/drive/MyDrive/ImageCaptioningProject/FinalizedModels/ImageModels/wordtoix.pkl', 'wb') as wordtoix_file:
  dump(wordtoix, wordtoix_file)
with open('/content/drive/MyDrive/ImageCaptioningProject/FinalizedModels/ImageModels/ixtoword.pkl', 'wb') as ixtoword_file:
  dump(ixtoword, ixtoword_file)
print('Vocab size: %d' % vocab_size)

# Get GloVe embedding matrix of descriptions
glove_embeddings_matrix = getGloveEmbeddings(wordtoix=wordtoix, vocab_size=vocab_size)
# Determine the maximum sequence (i.e. words in description) length
max_length = helper.calcMaxLength(train_descriptions)
print('Max length: %d' % max_length)

# Prepare training sequences
X1train, X2train, ytrain = createSequences(wordtoix, max_length, train_descriptions, train_features, vocab_size)

Vocab size: 2915
Max length: 34


## Load validation data
Creates input/output pairs for the validation data

In [7]:
# Validation split: same loading steps as the training split, but driven by the dev image ids
val_image_ids_file = '/content/drive/My Drive/ImageCaptioningProject/Flickr8k_text/Flickr_8k.devImages.txt'
val_ids = helper.loadImageIds(val_image_ids_file)
val_descriptions = helper.loadCleanDescriptions(descriptions_file, val_ids)
val_features = helper.loadImageFeatures(features_file, val_ids)

# Encode with the vocabulary and max_length obtained from the training data
X1val, X2val, yval = createSequences(
    wordtoix=wordtoix,
    max_length=max_length,
    descriptions=val_descriptions,
    photos_features=val_features,
    vocab_size=vocab_size,
)

### Bahdanau Attention

In [8]:
from keras.layers import Layer
import keras.backend as K

class attention(Layer):
    """Attention pooling layer that collapses axis 1 of its input.

    A scalar score is computed per step as ``tanh(x . W + b)``, the scores
    are normalised with a softmax, and the input is summed over axis 1 using
    those weights. The indexing in ``build``/``call`` assumes an input of
    shape (batch, steps, features) with a fixed number of steps.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        steps, features = input_shape[1], input_shape[-1]
        # One projection weight per feature, one bias per step
        self.W = self.add_weight(name="att_weight", shape=(features, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(steps, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        # Per-step score, with the trailing singleton axis removed
        scores = K.tanh(K.dot(x, self.W) + self.b)
        scores = K.squeeze(scores, axis=-1)
        # Softmax-normalised weights, re-expanded so they broadcast over the features
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        # Weighted sum over axis 1
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        batch, features = input_shape[0], input_shape[-1]
        return (batch, features)

    def get_config(self):
        # No extra constructor arguments, so the base config is sufficient
        return super().get_config()

## Building model - Bahdanau attention on Images


In [9]:
# Defining the captioning model
def getModel(vocab_size, max_length, embedding_matrix=None):
  """Build and compile the image-captioning model.

  The image branch runs the (2048, 1) feature input through dropout, an LSTM
  and the ``attention`` layer; the text branch embeds the padded word indices
  with frozen pre-trained vectors and runs them through an LSTM. Both 256-d
  outputs are added and decoded into a softmax over the vocabulary.

  Args:
    vocab_size: number of rows of the embedding and size of the output softmax.
    max_length: length of the padded input word sequences.
    embedding_matrix: pre-trained embedding weights of shape
      (vocab_size, embedding_dim). When omitted, the notebook-level
      ``glove_embeddings_matrix`` is used, as before.

  Returns:
    The compiled Keras model.
  """
  if embedding_matrix is None:
    # Backward-compatible fallback to the matrix built earlier in the notebook.
    embedding_matrix = glove_embeddings_matrix
  embedding_matrix = np.asarray(embedding_matrix)
  # Take the embedding width from the weights so the layer always matches them.
  embedding_dim = embedding_matrix.shape[1]

  # feature extractor model
  inputs1 = Input(shape=(2048,1))
  fe1 = Dropout(0.5)(inputs1)
  pic_att_in = LSTM(256, input_shape=(2048,1), return_sequences=True)(fe1)
  pic_att_out = attention()(pic_att_in)
  # sequence model
  inputs2 = Input(shape=(max_length,))
  se1 = Embedding(vocab_size, embedding_dim, mask_zero=True, name='se1')(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(256)(se2)
  # decoder model
  decoder1 = add([pic_att_out, se3])
  decoder2 = Dense(256, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)
  # tie it together [image, seq] -> [word]
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)

  # do not want to retrain the weights in text embedding layer (pre-trained Glove vectors);
  # trainable is switched off before compile so the setting takes effect
  model.get_layer('se1').set_weights([embedding_matrix])
  model.get_layer('se1').trainable = False
  model.compile(loss='categorical_crossentropy', optimizer='adam')

  # summarize model; summary() prints by itself and returns None, so it is not wrapped in print()
  model.summary()
  plot_model(model, to_file='/content/drive/MyDrive/ImageCaptioningProject/FinalizedModels/ImageModels/Images_model.png', show_shapes=True)
  return model

## Train model
The model is saved after every epoch in which the validation loss improves.

In [None]:
model = getModel(vocab_size, max_length)

# Checkpointing: keep a copy of the model whenever validation loss improves,
# then pick the best-scoring checkpoint as the final model.
# https://www.tensorflow.org/tutorials/keras/save_and_load
# The path has no .h5 extension (SavedModel format), since the model contains a custom layer.

# Epoch number, training loss and validation loss are substituted into the name
filepath = '/content/drive/MyDrive/ImageCaptioningProject/FinalizedModels/ImageModels/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Train, evaluating on the validation pairs after every epoch
history = model.fit(
    [X1train, X2train],
    ytrain,
    epochs=10,
    verbose=2,
    callbacks=[checkpoint],
    validation_data=([X1val, X2val], yval),
)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2048, 1)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 34)]         0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048, 1)      0           input_1[0][0]                    
__________________________________________________________________________________________________
se1 (Embedding)                 (None, 34, 200)      583000      input_2[0][0]                    
_______________________________________________________________________________________

In [None]:
# Save model history
# The file must be opened in binary *write* mode: pickle.dump writes bytes, and
# a read-mode handle either fails to open (file missing) or cannot be written to.
with open('/content/drive/MyDrive/ImageCaptioningProject/FinalizedModels/ImageModels/history.pkl', 'wb') as history_file:
  dump(history.history, history_file)