In [1]:
import tensorflow as tf
import tensorflow.python.platform
import pickle as pkl
import os
import math

import numpy as np
import pandas as pd
import pickle
import cv2
#import skimage
from keras.preprocessing import sequence
from collections import Counter

Using TensorFlow backend.


In [2]:
model_path = './models/tensorflow'
model_path_transfer = './models/tf_final'
#Features of VGG 16
feature_path = './data/feats.npy'
#Captions for images
annotation_path = './data/results_20130124.token'

In [3]:
def data_input(annotation_path, feature_path):
     annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
     features=np.load(feature_path,'r')
     capt=annotations['caption'].values
     return features,capt 

In [4]:
feats, captions = data_input(annotation_path,feature_path)

In [5]:
print(feats.shape)
print(captions.shape)

(158915, 4096)
(158915,)


In [6]:
print(captions[0])

Two young guys with shaggy hair look at their hands while hanging out in the yard .


In [7]:
def preprocess_data(sentence, max_word_count=30): # function from Andre Karpathy's NeuralTalk
    print('preprocessing %d word vocab' % (max_word_count, ))
    no_of_words = {}
    nsents = 0
    for s in sentence:
      nsents += 1
      for w in s.lower().split(' '):
        no_of_words[w] = no_of_words.get(w, 0) + 1
    vocab = [w for w in no_of_words if no_of_words[w] >= max_word_count]
    print('preprocessed words %d -> %d' % (len(no_of_words), len(vocab)))

    ixtoword = {}
    ixtoword[0] = '.'  
    wordtoix = {}
    wordtoix['#START#'] = 0 
    ix = 1
    for w in vocab:
      wordtoix[w] = ix
      ixtoword[ix] = w
      ix += 1

    no_of_words['.'] = nsents
    bias_init_vector = np.array([1.0*no_of_words[ixtoword[i]] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) 
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) 
    return wordtoix, ixtoword, bias_init_vector.astype(np.float32)


In [8]:
class Caption_Generator():
    def __init__(self, input_dimension, dim_embed, hidden_dimension, batch_size, num_lstm_steps, n_words, init_b):

        self.input_dimension = input_dimension
        self.dim_embed = dim_embed
        self.hidden_dimension = hidden_dimension
        self.batch_size = batch_size
        self.num_lstm_steps = num_lstm_steps
        self.n_words = n_words
        
        # declare the variables to be used for our word embeddings
        with tf.device("/cpu:0"):
            self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1))

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]))
        
        # declare the LSTM itself
        self.lstm = tf.contrib.rnn.BasicLSTMCell(hidden_dimension)
        
        # declare the variables to be used to embed the image feature embedding to the word embedding space
        self.embedded_img = tf.Variable(tf.random_uniform([input_dimension, hidden_dimension], -0.1, 0.1))
        self.embedded_img_bias = tf.Variable(tf.zeros([hidden_dimension]))

        # declare the variables to go from an LSTM output to a word encoding output
        self.encoded_word = tf.Variable(tf.random_uniform([hidden_dimension, n_words], -0.1, 0.1))
        # initialize this bias variable from the preprocess_data output
        self.encoded_word_bias = tf.Variable(init_b)

    def build_model(self):
        # declaring the placeholders for our extracted image feature vectors, our caption, and our mask
        # (describes how long our caption is with an array of 0/1 values of length `maxlen`  
        img = tf.placeholder(tf.float32, [self.batch_size, self.input_dimension])
        caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.num_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.num_lstm_steps])
        
        # getting an initial LSTM embedding from our image_imbedding
        image_embedding = tf.matmul(img, self.embedded_img) + self.embedded_img_bias
        
        # setting initial state of our LSTM
        state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)

        total_loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.num_lstm_steps): 
                if i > 0:
                   #if this isn’t the first iteration of our LSTM we need to get the word_embedding corresponding
                   # to the (i-1)th word in our caption 
                    with tf.device("/cpu:0"):
                        current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
                else:
                     #if this is the first iteration of our LSTM we utilize the embedded image as our input 
                    current_embedding = image_embedding
                if i > 0: 
                    # allows us to reuse the LSTM tensor variable on each iteration
                    tf.get_variable_scope().reuse_variables()

                out, state = self.lstm(current_embedding, state)

                
                if i > 0:
                    #get the one-hot representation of the next word in our caption 
                    labels = tf.expand_dims(caption_placeholder[:, i], 1)
                    ix_range=tf.range(0, self.batch_size, 1)
                    ixs = tf.expand_dims(ix_range, 1)
                    concat = tf.concat([ixs, labels],1)
                    onehot = tf.sparse_to_dense(
                            concat, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)


                    #perform a softmax classification to generate the next word in the caption
                    logit = tf.matmul(out, self.encoded_word) + self.encoded_word_bias
                    loss_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)
                    loss_cross_entropy = loss_cross_entropy * mask[:,i]

                    loss = tf.reduce_sum(loss_cross_entropy)
                    total_loss += loss

            total_loss = total_loss / tf.reduce_sum(mask[:,1:])
            return total_loss, img,  caption_placeholder, mask


In [None]:

### Parameters ###
dim_embed = 256
hidden_dimension = 256
input_dimension = 4096
batch_size = 128
momentum = 0.9
n_epochs = 150

def train(learning_rate=0.001, continue_training=False, transfer=True):
    
    tf.reset_default_graph()

    feats, captions = data_input(annotation_path, feature_path)
    wordtoix, ixtoword, init_b = preprocess_data(captions)

    np.save('data/ixtoword', ixtoword)

    index = (np.arange(len(feats)).astype(int))
    np.random.shuffle(index)


    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = np.max( [x for x in map(lambda x: len(x.split(' ')), captions) ] )
    caption_generator = Caption_Generator(input_dimension, hidden_dimension, dim_embed, batch_size, maxlen+2, n_words, init_b)

    loss, image, sentence, mask = caption_generator.build_model()

    saver = tf.train.Saver(max_to_keep=100)
    global_step=tf.Variable(0,trainable=False)
    learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                       int(len(index)/batch_size), 0.95)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    tf.global_variables_initializer().run()

    if continue_training:
        if not transfer:
            saver.restore(sess,tf.train.latest_checkpoint(model_path))
        else:
            saver.restore(sess,tf.train.latest_checkpoint(model_path_transfer))
    losses=[]
    for epoch in range(n_epochs):
        for start, end in zip( range(0, len(index), batch_size), range(batch_size, len(index), batch_size)):

            current_feats = feats[index[start:end]]
            current_captions = captions[index[start:end]]
            current_caption_ind = [x for x in map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix], current_captions)]

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)
            current_caption_matrix = np.hstack( [np.full( (len(current_caption_matrix),1), 0), current_caption_matrix] )

            current_mask_matrix = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array([x for x in map(lambda x: (x != 0).sum()+2, current_caption_matrix )])

            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run([train_op, loss], feed_dict={
                image: current_feats.astype(np.float32),
                sentence : current_caption_matrix.astype(np.int32),
                mask : current_mask_matrix.astype(np.float32)
                })

            print("Current Cost: ", loss_value, "\t Epoch {}/{}".format(epoch, n_epochs), "\t Iter {}/{}".format(start,len(feats)))
        print("Saving the model from epoch: ", epoch)
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)


In [None]:
try:
    train(.001,False,False) #train from scratch
except KeyboardInterrupt:
    print('Exiting Training')

preprocessing 30 word vocab
preprocessed words 20326 -> 2942
Current Cost:  5.46115 	 Epoch 0/150 	 Iter 0/158915
Current Cost:  5.46305 	 Epoch 0/150 	 Iter 128/158915
Current Cost:  5.30741 	 Epoch 0/150 	 Iter 256/158915
Current Cost:  5.32423 	 Epoch 0/150 	 Iter 384/158915
Current Cost:  5.35012 	 Epoch 0/150 	 Iter 512/158915
Current Cost:  5.3583 	 Epoch 0/150 	 Iter 640/158915
Current Cost:  5.25105 	 Epoch 0/150 	 Iter 768/158915
Current Cost:  5.29672 	 Epoch 0/150 	 Iter 896/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1024/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1152/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1280/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1408/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1536/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1664/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1792/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 1920/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 2048/158915
Current Cost:  nan 	 Epoch 0/150 	

Current Cost:  nan 	 Epoch 0/150 	 Iter 19840/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 19968/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20096/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20224/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20352/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20480/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20608/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20736/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20864/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 20992/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21120/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21248/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21376/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21504/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21632/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21760/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 21888/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 22016/158915
Current Cost:  nan 	 Epoch 0/150 	 Iter 22144/