In [44]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from pickle import dump, load
import string
import os
from time import time
from PIL import Image
import glob
import pickle

import tensorflow as tf

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.layers.wrappers import Bidirectional
from keras.applications.inception_v3 import preprocess_input
from keras.layers.merge import add
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras.utils import to_categorical

# Make tf.function-wrapped code run eagerly (step by step in Python).
# NOTE(review): eager execution is usually slower than graph mode — confirm this
# debugging switch is still wanted for the long training runs below.
tf.config.run_functions_eagerly(True)

In [2]:
def load_doc(filename, encoding=None):
    """Read a whole text file and return its content as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Optional text encoding forwarded to ``open``. ``None``
            keeps the platform default, i.e. the previous behaviour.

    Returns:
        The complete file content as a ``str``.
    """
    # The context manager closes the handle even when ``read`` raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [16]:
def load_descriptions(doc):
    """Group captions by image id.

    Args:
        doc: Iterable of text lines shaped like
            ``<image file>|<caption index>|<caption>``.

    Returns:
        dict mapping an image id (the file name up to its first ``.``) to the
        list of captions found for that image, in input order. The caption
        text is stored exactly as it appears after the second ``|``.
    """
    mapping = {}

    for line in doc:
        # Skip blank / one-character lines.
        if len(line) < 2:
            continue

        tokens = line.split('|')

        # A record with a missing '|' separator has no caption field; skip it
        # instead of raising IndexError on tokens[2].
        if len(tokens) < 3:
            continue

        image_id = tokens[0].split('.')[0]
        mapping.setdefault(image_id, []).append(tokens[2])

    return mapping

# Read the raw caption file and group its captions per image.
filename = "../../data/flickr30k_images/results.csv"
doc = load_doc(filename)

# The first line is skipped (presumably the CSV header row).
caption_lines = doc.split('\n')[1:]
descriptions = load_descriptions(caption_lines)
print('Loaded: %d ' % len(descriptions))

line check
Loaded: 31783 


In [17]:
# Peek at the first five image ids (iterating a dict yields its keys).
list(descriptions)[:5]

['1000092795', '10002456', '1000268201', '1000344755', '1000366164']

In [21]:
def preprocess_descriptions(descriptions):
    """Normalise every caption of ``descriptions`` in place.

    Each caption is split on whitespace; every word is lower-cased and
    stripped of ASCII punctuation, then dropped unless it is longer than one
    character and purely alphabetic. The surviving words are re-joined with
    single spaces and written back into the same list slot.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        None; the lists inside ``descriptions`` are modified directly.
    """
    strip_punct = str.maketrans('', '', string.punctuation)

    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            words = (w.lower().translate(strip_punct) for w in caption.split())
            captions[idx] = ' '.join(w for w in words if len(w) > 1 and w.isalpha())

# Clean all captions in place; the function returns nothing, so re-running this
# cell re-cleans the already-cleaned text.
preprocess_descriptions(descriptions)

In [23]:
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all captions.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        set of every whitespace-separated token found in any caption.
    """
    all_desc = set()
    # Plain loops instead of a list comprehension used only for side effects,
    # which built and discarded a list of ``None`` values.
    for desc_list in descriptions.values():
        for d in desc_list:
            all_desc.update(d.split())
    return all_desc

# Distinct words across the cleaned captions (before any frequency filtering).
vocabulary = to_vocabulary(descriptions)
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 19735


In [24]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write every caption to ``filename`` as ``<image id> <caption>`` lines.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.
        filename: Destination path; an existing file is overwritten.

    Lines are joined with ``\\n`` and no trailing newline is written.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager flushes and closes the file even if the write fails.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

# Write to the same path that load_clean_descriptions() is later pointed at, so a
# fresh Restart & Run All does not depend on a manually moved file.
save_descriptions(descriptions, '../../data/flickr30k_images/descriptions.txt')

In [4]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    """Return the set of image identifiers listed in ``filename``.

    The first line of the file is skipped. For every remaining line of at
    least two characters, the identifier is the text before the first ``.``.
    """
    rows = load_doc(filename).split('\n')[1:]
    return {row.split('.')[0] for row in rows if len(row) >= 2}

# load the training identifiers
# NOTE(review): this reads the full captions CSV, so it yields every image id
# (31,783 in the recorded output) rather than a separate 6K split.
filename = '../../data/flickr30k_images/results.csv'
train = load_set(filename)
print('Dataset: %d' % len(train))


Dataset: 31783


In [5]:
# Directory that holds all the images (the trailing slash is relied on below)
images = '../../data/flickr30k_images/flickr30k_images/'
# Paths of every .jpg file found directly inside that directory
img = glob.glob('{}*.jpg'.format(images))

In [6]:

# Every image found on disk is used for training: take a shallow copy of the list.
train_img = list(img)

In [7]:
def load_clean_descriptions(filename, dataset):
    """Load cleaned captions for the ids in ``dataset`` and add sequence markers.

    Args:
        filename: File with one ``<image id> <caption words...>`` entry per line.
        dataset: Container of image ids to keep (membership-tested with ``in``).

    Returns:
        dict mapping each kept image id to its captions, each wrapped as
        ``'startseq <caption> endseq'``.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()

        # Blank lines (an empty file, a trailing newline) produce no tokens;
        # skip them instead of raising IndexError on tokens[0].
        if not tokens:
            continue

        image_id, image_desc = tokens[0], tokens[1:]
        if image_id not in dataset:
            continue

        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        descriptions.setdefault(image_id, list()).append(desc)
    return descriptions

# Reload the cleaned captions, keeping only ids present in `train`; every caption
# comes back wrapped in 'startseq' / 'endseq' markers.
train_descriptions = load_clean_descriptions('../../data/flickr30k_images/descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=31783


In [8]:
def preprocess(image_path):
    """Load one image file and prepare it as an InceptionV3 input batch.

    The image is resized to 299x299 on load, converted to a numpy array,
    given a leading batch axis and passed through the inception
    ``preprocess_input`` function.
    """
    pil_img = image.load_img(image_path, target_size=(299, 299))
    # Prepend the batch axis while converting, then apply inception preprocessing
    batch = image.img_to_array(pil_img)[np.newaxis, ...]
    return preprocess_input(batch)

In [9]:
# Load the inception v3 model with its ImageNet weights
# NOTE: the name `model` is rebound to the captioning network further down, so
# `model_new` must be built from this object before that cell runs.
model = InceptionV3(weights='imagenet')

In [10]:
# Create a new model, by removing the last layer (output layer) from the inception v3:
# same input, but the output is taken from the second-to-last layer.
model_new = Model(model.input, model.layers[-2].output)

In [11]:
# Function to encode a given image into a flat feature vector
def encode(image):
    """Return the ``model_new`` feature vector for the image at path ``image``.

    The image is preprocessed into a one-element batch, run through
    ``model_new`` and the batch axis is dropped, so the result is 1-D with
    length ``features.shape[1]``.
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    # (1, n) -> (n,)
    return features.reshape(features.shape[1])

In [12]:
# Encode every training image; keys are the paths with the `images` directory
# prefix removed (i.e. the bare file names).
# This will take a while on CPU - execute this only once
start = time()
encoding_train = {}
prefix_len = len(images)
for path in tqdm(train_img):
    encoding_train[path[prefix_len:]] = encode(path)
print("Time taken in seconds =", time()-start)

100%|██████████| 31783/31783 [1:47:01<00:00,  4.95it/s]  

Time taken in seconds = 6421.650390386581





In [15]:
# Persist the image feature vectors so the slow encoding pass need not be repeated
with open("../../data/flickr30k_images/encoded_train_images.pkl", "wb") as out_file:
    pickle.dump(encoding_train, out_file)

In [17]:
# Reload the cached image features. The `with` block closes the file handle,
# which the previous `load(open(...))` form left open.
# NOTE: only unpickle files you created yourself — pickle can execute arbitrary code.
with open("../../data/flickr30k_images/encoded_train_images.pkl", "rb") as encoded_pickle:
    train_features = load(encoded_pickle)
print('Photos: train=%d' % len(train_features))

Photos: train=31783


In [18]:
# Flatten the per-image caption lists into one list of caption strings
all_train_captions = [cap for caps in train_descriptions.values() for cap in caps]
len(all_train_captions)

158915

In [19]:
# Keep only words that occur at least `word_count_threshold` times in the corpus
word_count_threshold = 10
word_counts = {}
for sent in all_train_captions:
    for w in sent.split(' '):
        if w in word_counts:
            word_counts[w] += 1
        else:
            word_counts[w] = 1
# number of captions processed
nsents = len(all_train_captions)

vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

preprocessed words 19737 -> 5437


In [20]:
# Two-way mapping between words and integer ids; ids start at 1, leaving 0 unused
wordtoix = {w: ix for ix, w in enumerate(vocab, start=1)}
ixtoword = {ix: w for w, ix in wordtoix.items()}

In [21]:
# Word ids start at 1, so index 0 is never assigned to a word; the extra slot
# accounts for that index (used for zero padding of the input sequences).
vocab_size = len(ixtoword) + 1 # one for appended 0's
vocab_size

5438

In [22]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten ``{image id: [captions]}`` into a single list of captions."""
    all_desc = list()
    for key in descriptions.keys():
        # extend() instead of a list comprehension used only for side effects
        all_desc.extend(descriptions[key])
    return all_desc

# calculate the length of the description with the most words
def max_description_length(descriptions):
    """Return the largest whitespace-separated word count over all captions.

    Named differently from the ``max_length`` value below: the previous
    ``max_length = max_length(...)`` replaced the function with an int, so
    running the cell a second time raised ``TypeError``.
    """
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

# determine the maximum sequence length
max_length = max_description_length(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 74


In [24]:
# Load Glove vectors
glove_dir = '../../data/'
embeddings_index = {} # word -> float32 coefficient vector

# `with` closes the file even if parsing a line raises midway.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Tolerate blank lines instead of failing on values[0]
        if not values:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [25]:
embedding_dim = 200

# One row of `embedding_dim` values per word id in our vocabulary.
# Rows start as zeros, so a word missing from the GloVe index keeps an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [26]:
# Sanity check: expected shape is (vocab_size, embedding_dim)
embedding_matrix.shape

(5438, 200)

In [31]:
# Image branch: 2048-value feature vector -> dropout -> 256-unit dense layer
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: word-id sequence of length max_length -> embedding (id 0 masked) -> dropout -> LSTM
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Merge the two 256-unit branches with `add`, then a softmax over the vocabulary
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# NOTE: this rebinds `model`, which earlier referred to the InceptionV3 network
model = Model(inputs=[inputs1, inputs2], outputs=outputs)


In [32]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 74)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 74, 200)      1087600     input_7[0][0]                    
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, 2048)         0           input_6[0][0]                    
____________________________________________________________________________________________

In [33]:
# Check which layer sits at index 2 (the recorded output shows the Embedding layer)
model.layers[2]

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fdeadbfafd0>

In [34]:
# Locate the embedding layer by type rather than by position: model.layers[2]
# would silently point at another layer if the architecture were changed.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
# Load the pre-trained GloVe matrix and freeze it so training does not update it
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [35]:
# Targets produced by data_generator are one-hot vectors, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [41]:
# data generator, intended to be passed to model.fit()
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch, vocab_size=None):
    """Endlessly yield ``((photo_features, input_sequences), next_word_one_hot)`` batches.

    Every caption is turned into one training sample per prefix: the input is
    the prefix padded to ``max_length`` and the target is the following word,
    one-hot encoded. A batch is emitted after every ``num_photos_per_batch``
    photos, so the number of samples per batch varies with caption lengths.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.
        photos: mapping from ``<image id>.jpg`` to that photo's feature vector.
        wordtoix: dict mapping a word to its integer id; unknown words are skipped.
        max_length: length every input sequence is padded to.
        num_photos_per_batch: number of photos whose samples form one batch.
        vocab_size: number of classes of the one-hot target. Defaults to
            ``len(wordtoix) + 1`` (ids start at 1, 0 is the padding id), which
            removes the former dependency on a global ``vocab_size``.

    Raises:
        ValueError: if ``num_photos_per_batch`` is smaller than 1 (the
            generator could never yield).
    """
    if num_photos_per_batch < 1:
        raise ValueError('num_photos_per_batch must be >= 1')
    if not descriptions:
        # Nothing to iterate over: stop instead of spinning forever.
        return
    if vocab_size is None:
        vocab_size = len(wordtoix) + 1

    X1, X2, y = list(), list(), list()
    n = 0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence, dropping out-of-vocabulary words
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # input is everything before position i, target is the word at i
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data; the photo counter is not reset between passes,
            # so leftover samples carry over into the next pass over the data
            if n == num_photos_per_batch:
                yield ((array(X1), array(X2)), array(y))
                X1, X2, y = list(), list(), list()
                n = 0

In [42]:
# Training schedule for the first phase
epochs = 10
# photos per generator batch (name keeps its original spelling; other cells use it)
number_pics_per_bath = 3
# batches per pass over the training images; integer division drops any remainder
steps = len(train_descriptions)//number_pics_per_bath

In [51]:
# Train one epoch per fit() call so a checkpoint can be written between epochs.
# Make sure the checkpoint folder exists before the first save.
os.makedirs('./model_weights', exist_ok=True)

# Loss of every finished epoch. The previous `loss += history.history['loss']`
# sat inside a bare `except:` that hid a NameError on the first pass and a
# TypeError (int += list) afterwards, so nothing was ever tracked.
epoch_losses = []
for i in range(epochs):
    # A fresh generator restarts iteration from the first image
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    history = model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # history.history['loss'] holds one value per epoch run by this fit() call
    loss = history.history['loss'][-1]
    epoch_losses.append(loss)
    print('epoch %d loss: %s' % (i, loss))
    # checkpoint every third epoch
    if i % 3 == 0:
        model.save('./model_weights/model_' + str(i) + '.h5')

0
0
   45/10594 [..............................] - ETA: 4:13:24 - loss: 5.6801

KeyboardInterrupt: 

In [None]:
# Same schedule, saving the full model after every epoch.
# Make sure the checkpoint folder exists before the first save.
os.makedirs('./model_weights', exist_ok=True)
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    # model.fit accepts generators directly; fit_generator is deprecated
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./model_weights/model_' + str(i) + '.h5')

In [None]:
# Second training phase: set the optimizer's learning rate to 1e-4 and double the photos per batch
# NOTE(review): `lr` is the legacy alias of `learning_rate` — confirm the installed
# Keras version still honours assignment through it.
model.optimizer.lr = 0.0001
epochs = 10
number_pics_per_bath = 6
# recompute batches per pass for the new batch size
steps = len(train_descriptions)//number_pics_per_bath

In [None]:
# Second-phase training; no per-epoch checkpoint is written in this loop.
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    # model.fit accepts generators directly; fit_generator is deprecated
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [None]:
# Store only the weights (not the full model) of the final network.
# NOTE(review): the './model_weights' folder presumably has to exist already — confirm.
model.save_weights('./model_weights/model_30.h5')