**Implementing image captioning model using flicker_8k dataset. 7k images for training and 1k images for test purpose. I am using VGG16 model as the part of the transfer learning.**

In [0]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img # used to import image in PIL format
from keras.preprocessing.image import img_to_array # used to convert the PIL image to numpy array format
from keras.applications.vgg16 import preprocess_input #used to extract features from the image
from keras.models import Model

In [0]:
def extract_features(directory):
  #load the model
  model = VGG16()
  # re-structure the model
  model.layers.pop()
  model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
  #summarize
  # print(model.summary())

  #extract features from images
  features = dict()
  for name in listdir(directory):
    #load image from file
    filepath = directory + '/' + name
    image = load_img(filepath, target_size=(224, 224))
    #convert the image to numpy array
    image = img_to_array(image)
    #reshape the image
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    #prepare the image for VGG model
    image = preprocess_input(image)
    #get features
    feature = model.predict(image, verbose=0)
    image_id = name.split('.')[0]
    #sore the image
    features[image_id] = feature

  return features

# extract features from all th images
directory = 'flicker-dataset'
features = extract_features(directory)
print(len(features))
# dump(features, open('Image_Caption_Modelling/features.pkl', 'wb'))

1


In [0]:
import string

#load doc into memory
def load_doc(filename):
  #open the file as read only
  file = open(filename, 'r')
  #read all text
  text = file.read()
  #close the file
  file.close()
  return text

#extract descriptions from images
def load_descriptions(doc):
  mapping = dict()
  for line in doc.split('\n'):
    if len(line)< 0:
      continue
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    image_id = image_id.split('.')[0]
    image_desc = 'starteq '+" ".join(image_desc)+' endeq'

    if image_id not in mapping:
      mapping[image_id] = list()
    
    #store description
    mapping[image_id].append(image_desc)

  return mapping


In [0]:
def to_vocabulary(descriptions):
  #build list of all description strings
  all_desc = set()
  for key in descriptions.keys():
    [all_desc.update(d.split()) for d in descriptions[key]]
  return all_desc

In [0]:
filename = '/content/Flickr8k.token.txt'

#load descriptions
doc = load_doc(filename)

#parse descriptions
descriptions = load_descriptions(doc)
print(descriptions)
vocabulary = to_vocabulary(descriptions)
print(len(vocabulary))

{'10815824_2997e03d76': ['starteq A blonde horse and a blonde girl in a black sweatshirt are staring at a fire in a barrel . endeq', 'starteq A girl and her horse stand by a fire . endeq', "starteq A girl holding a horse 's lead behind a fire . endeq", 'starteq A man , and girl and two horses are near a contained fire . endeq', 'starteq Two people and two horses watching a fire . endeq']}
33


In [0]:
l = list()

for key in descriptions.keys():
    [l.append(d) for d in descriptions[key]]

l

['starteq A blonde horse and a blonde girl in a black sweatshirt are staring at a fire in a barrel . endeq',
 'starteq A girl and her horse stand by a fire . endeq',
 "starteq A girl holding a horse 's lead behind a fire . endeq",
 'starteq A man , and girl and two horses are near a contained fire . endeq',
 'starteq Two people and two horses watching a fire . endeq']

In [0]:
#convert the dictionary of clean descriptions to the list of descriptions
def to_lines(descriptions):
  all_desc = list()
  for key in descriptions.keys():
    [all_desc.append(d) for d in descriptions[key]]
  return all_desc

In [0]:
#fit the tokenizer
from keras.preprocessing.text import Tokenizer
def create_tokenizer(descriptions):
  lines = to_lines(descriptions)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [0]:
print(to_lines(descriptions))

['starteq A blonde horse and a blonde girl in a black sweatshirt are staring at a fire in a barrel . endeq', 'starteq A girl and her horse stand by a fire . endeq', "starteq A girl holding a horse 's lead behind a fire . endeq", 'starteq A man , and girl and two horses are near a contained fire . endeq', 'starteq Two people and two horses watching a fire . endeq']


In [0]:
#calculate the length of the descriptions with most words
def max_length(descriptions):
  lines = to_lines(descriptions)
  return max(len(d.split()) for d in lines)

In [0]:
max_length(descriptions)

22

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from numpy import array

tokenizer = create_tokenizer(descriptions)
#create sequences is written to generate the data on which model will train itself. 
#This is mainly done when we have less memory
def create_sequences(tokenizer, max_length, desc_list, photo):
	X1, X2, y = list(), list(), list()
	# walk through each description for the image
	for desc in desc_list:
		# encode the sequence
		seq = tokenizer.texts_to_sequences([desc])[0]
		# split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# store
			X1.append(photo)
			X2.append(in_seq)
			y.append(out_seq)
	return array(X1), array(X2), array(y)

In [0]:
c = create_sequences(tokenizer, max_length(descriptions), to_lines(descriptions), features["10815824_2997e03d76"])
print(c)

(array([[[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]]], dtype=float32), array([[ 0,  0,  0, ...,  0,  0,  2],
       [ 0,  0,  0, ...,  0,  2,  1],
       [ 0,  0,  0, ...,  2,  1,  9],
       ...,
       [ 0,  0,  0, ...,  8, 12, 29],
       [ 0,  0,  0, ..., 12, 29,  1],
       [ 0,  0,  0, ..., 29,  1,  4]], dtype=int32), array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))


In [0]:
tokenizer = create_tokenizer(descriptions)
desc_list = to_lines(descriptions)
print(desc_list[0])
l = tokenizer.texts_to_sequences([desc_list[0]])[0]
print(max_length)
in_seq, out_seq = l[:1], l[1]
print(in_seq)
vocabulary = to_vocabulary(descriptions)
vocab_size = len(vocabulary)
in_seq = pad_sequences([in_seq], maxlen=max_length(descriptions))[0]
out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
print(in_seq)
print(out_seq)

starteq A blonde horse and a blonde girl in a black sweatshirt are staring at a fire in a barrel . endeq
<function max_length at 0x7fd3c5133b70>
[2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [0]:
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

In [0]:
def model(vocab_size, max_length):
  # featue extract model
  inputs1 = Input(shape=(4096,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activation='relu')(fe1)

  #sequence model
  inputs2 = Input(shape=(max_length,))
  se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(256)(se2)

  #decorder model
  decorder1 = add([fe2, se3])
  decorder2 = Dense(256, activation="relu")(decorder1)
  outputs = Dense(vocab_size, activation="softmax")(decorder2)

  #tie the all
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss="categorical_crossentropy", optimizer="adam")

  #model summary
  print(model.summary)
  return model

In [0]:
descriptions, features["10815824_2997e03d76"]

({'10815824_2997e03d76': ['starteq A blonde horse and a blonde girl in a black sweatshirt are staring at a fire in a barrel . endeq',
   'starteq A girl and her horse stand by a fire . endeq',
   "starteq A girl holding a horse 's lead behind a fire . endeq",
   'starteq A man , and girl and two horses are near a contained fire . endeq',
   'starteq Two people and two horses watching a fire . endeq']},
 array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

In [0]:
def data_generator(descriptions, photos, tokenizer, max_length):
	# loop for ever over images
	while 1:
		for key, desc_list in descriptions.items():
			# retrieve the photo feature
			photo = photos[key][0]
			in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
			yield [[in_img, in_seq], out_word]

In [0]:
#train the model
# print(vocab_size)
# print(max_length(descriptions))
# model = model(vocab_size, max_length(descriptions))
# model.compile(loss='categorical_crossentropy',
#               optimizer="adam",
#               metrics=['accuracy'])
# model.fit(features, descriptions, epochs=10)

In [0]:
# train the model
model = model(vocab_size, max_length(descriptions))
# train the model, run epochs manually and save after each epoch
epochs = 20
steps = len(descriptions)
for i in range(epochs):
	# create the data generator
	generator = data_generator(descriptions, features, tokenizer, max_length(descriptions))
	# fit for one epoch
	model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
	# save model
	model.save('model_' + str(i) + '.h5')

<bound method Network.summary of <keras.engine.training.Model object at 0x7fd3c4f244a8>>
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [0]:
def word_for_id(number, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == number:
      return word
  return none

In [0]:
#generate description for image
def generate_desc(model, tokenizer, photo, max_length):
  in_text = 'starteq'
  for i in range(max_length):
    #integer encode input sequence
    sequence = tokenizer.texts_to_sequences([in_text])[0]
    #pad input
    sequence = pad_sequences([sequence], maxlen=max_length)
    #predict next word
    yhat = model.predict([photo, sequence], verbose=0)
    print(yhat)
    #convert probability to integer
    yhat = argmax(yhat)
    #map integer to word
    word = word_for_id(yhat, tokenizer)
    if word is None:
      break
    in_text += ' ' + word
    if word == 'endseq':
      break

  return in_text


In [0]:
from keras.models import load_model

model = load_model('model_18.h5')
# load and prepare the photograph
photo = extract_features('/content/test')
# generate description
description = generate_desc(model, tokenizer, photo, max_length(descriptions))

print(description)