In [None]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then build a PyDrive client (`drive`) that uses the
# application-default credentials; the download and upload cells rely on `drive`.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Reference the Drive file by its id and save its content locally as Flickr8k_Dataset.zip
# (presumably the image archive - verify the id points at the expected file).
download = drive.CreateFile({'id': '1bTCXbH56Mk35-qDwI62BRbQZb2O_-Pei'})
download.GetContentFile('Flickr8k_Dataset.zip')

In [None]:
# Reference the Drive file by its id and save its content locally as Flickr8k_text.zip
# (presumably the caption/split archive - verify the id points at the expected file).
download = drive.CreateFile({'id': '1DeH1zRdNK-f0urPoQuYyuJfEQEy29aMw'})
download.GetContentFile('Flickr8k_text.zip')

In [None]:
import shutil
# Unpack the text archive into ./Flickr8k_text (later cells read files from that folder).
shutil.unpack_archive("Flickr8k_text.zip", "./Flickr8k_text")
# No target directory is passed here, so the archive is unpacked into the current
# working directory.
shutil.unpack_archive("Flickr8k_Dataset.zip")

In [None]:
#helps in searching through a given path for all the files in the directory, i.e. Returns a list containing the names of the file entries in the directory given by the path
from os import listdir 
#pickle is used to serialize and deserialize a python object structure, so any object in python can be pickled so that it can be saved on disk
# pickle.dump is used to store the object data to the file
from pickle import dump
# pickle.load is used to load the object data from the file
from pickle import load

from keras.applications.vgg16 import VGG16
# used for loading an image from file as an PIL image object
from keras.preprocessing.image import load_img
# To convert the PIL image instance to the numpy array so that our model can understand the image
from keras.preprocessing.image import img_to_array
# used to preprocess any given image in order to extract features of that image
from keras.applications.vgg16 import preprocess_input
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

#we instantiate a model to include the necessary layers
from keras.models import Model

# For text based operations like getting rid of punctuations from text strings etc.
import string

Using TensorFlow backend.


Restructure VGG-16 Model

In [None]:
def extract_features(directory):
  '''
  Extract VGG16 features from each image in the directory.

  Every entry returned by listdir(directory) is loaded as a 224x224 image,
  preprocessed for VGG16 and pushed through the network truncated just
  before its final classification layer.

  Args:
    directory: path of the folder that contains the image files.

  Returns:
    dict mapping the image id (file name up to the first '.') to the array
    returned by model.predict for that image (one row per image; expected to
    be 4096 values wide for the stock VGG16, which is what define_model
    consumes).
  '''

  # load the model
  model = VGG16()

  # re-structure the model
  # VGG16 is used to classify images; we are not interested in the classification
  # itself but in the internal representation right before it is made.
  # Wire the new model's output to the second-to-last layer instead of calling
  # model.layers.pop(): on Keras versions where `layers` returns a copy, pop()
  # is a no-op and the softmax predictions would silently stay the output.
  model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

  # summarize
  # shows the architecture of the VGG model without the last classification layer
  # (summary() prints by itself and returns None, so it is not wrapped in print)
  model.summary()

  # extract features from each photo
  features = dict()
  # Go through each single image in the Flickr_8K dataset
  for name in listdir(directory):
    # load an image from file
    filename = directory + '/' + name
    image = load_img(filename, target_size=(224, 224)) # resized to the input size used here
    # convert image pixels into numpy array
    image = img_to_array(image)
    # add a leading batch dimension of size 1, as required by the CNN model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get features
    feature = model.predict(image, verbose=0)
    # get image id
    image_id = name.split('.')[0]
    # store feature
    features[image_id] = feature
    print('>%s' % name)

  return features

# extract from all images
directory = '/content/Flickr8k_Dataset'
features = extract_features(directory)
print('Extracted features: %d' % len(features))

# save to file; the context manager guarantees the pickle is flushed and the
# handle is closed even if dump() raises
with open('features.pkl', 'wb') as features_file:
  dump(features, features_file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
>3546027589_253553252a.jpg
>3693961165_9d6c333d5b.jpg
>294353408_d459bdaa68.jpg
>3255620561_7644747791.jpg
>2300168895_a9b83e16fc.jpg
>1787222774_d5c68cce53.jpg
>2249480913_e1695e5c28.jpg
>2736902411_a0010f89ae.jpg
>2176874361_2b4149010b.jpg
>3512033861_a357bb58b6.jpg
>1801063894_60bce29e19.jpg
>3607969989_68cc411493.jpg
>108899015_bf36131a57.jpg
>2431723485_bc6b8e6418.jpg
>3372251830_baa3665928.jpg
>3718892835_a3e74a3417.jpg
>2073105823_6dacade004.jpg
>3362871440_6c0f27c480.jpg
>3574930742_9081bd2426.jpg
>3273969811_42e9fa8f63.jpg
>944374205_fd3e69bfca.jpg
>3286198467_8880be127e.jpg
>485245061_5a5de43e20.jpg
>3587596696_9c5964c94d.jpg
>1164131282_b30926f332.jpg
>3434452829_62cee280bc.jpg
>381239475_044cbffa2b.jpg
>2694426634_118566f7ab.jpg
>2378127945_8dc9da82d7.jpg
>2858439751_daa3a30ab8.jpg
>1072153132_53d2bb1b60.jpg
>2497608431_8dfefc7a1a.jpg
>2616284322_b13e7c344e.jpg
>3640329164_20cb245fd5.jpg
>2543247940_083f1b7969

Convert image descriptions into vocabulary

In [None]:
# Read the flickr8K.token.txt which contains descriptions for all the images
# Each of the image file name contains a unique identifier which is linked with corresponding descriptions in the Flickr8k.token

# load doc into memory
def load_doc(filename):
    """Read a whole text file into memory.

    Args:
        filename: path of the file to read.

    Returns:
        The complete file content as a single string.
    """
    # `with` closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# Extract descriptions for images
# If provided with document text, will return a dictionary of image identifiers and corresponding descriptions
# Each of these image identifiers maps to a list of one or more text based descriptions
# This function takes a document as an argument
def load_descriptions(doc):
    """Parse the raw caption document into a per-image list of descriptions.

    Each non-empty line is split on whitespace; the first token is the image
    identifier and the remaining tokens form the description.

    Args:
        doc: full text of the caption file, one caption per line.

    Returns:
        dict mapping the image id (first token up to its first '.') to the
        list of description strings found for it, in file order.
    """
    mapping = dict()
    # process each sentence from the document by splitting on '\n'
    for line in doc.split('\n'):
        # each sentence is split on white space into individual tokens
        tokens = line.split()
        # Skip lines without a description. The check is on the token count:
        # a whitespace-only line has no tokens at all, and an id-only line
        # carries no caption.
        if len(tokens) < 2:
            continue
        # first token is the image identifier, the rest is the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove file extension (and anything after it) from image id
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # create the list if needed, then store the description
        mapping.setdefault(image_id, list()).append(image_desc)

    return mapping

# Cleaning the text descriptions so that they are consistent in terms of the text provided
# Normalizing the case of all tokens to lowercase. (the, The- same meaning)
# Remove all punctuation from tokens.
# Removing all tokens that contain one or fewer characters (after punctuation is removed), e.g. ‘a’ and hanging ‘s’ characters.
# Passing the dictionary of image identifiers and corresponding descriptions
def clean_descriptions(descriptions):
    """Normalise every description in place.

    Each caption is lower-cased, stripped of punctuation, and reduced to the
    purely alphabetic words that are longer than one character. The lists
    inside `descriptions` are modified directly; nothing is returned.

    Args:
        descriptions: dict mapping image ids to lists of description strings.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            # tokenize, lower-case and drop punctuation in one pass
            words = (token.lower().translate(strip_punct) for token in caption.split())
            # keep multi-character, purely alphabetic words only
            kept = [word for word in words if len(word) > 1 and word.isalpha()]
            # store back as a string
            captions[idx] = ' '.join(kept)

# convert the loaded descriptions into a vocabulary of words
# transform descriptions into a set so that we can get an idea of the size of our dataset's vocabulary
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all descriptions.

    Args:
        descriptions: dict mapping image ids to lists of description strings.

    Returns:
        set of every whitespace-separated word that occurs in any description.
    """
    vocab = set()
    for captions in descriptions.values():
        for caption in captions:
            # words are split on white space and merged into the set
            vocab.update(caption.split())
    return vocab

# save descriptions to file, one per line
# Input- dictionary mapping image identifiers to the corresponding cleaned descriptions, file in which we will be saving the mapping
def save_descriptions(descriptions, filename):
    """Write the descriptions to a text file, one '<image id> <description>' per line.

    Args:
        descriptions: dict mapping image ids to lists of description strings.
        filename: path of the file to (over)write.
    """
    lines = [key + ' ' + desc
             for key, desc_list in descriptions.items()
             for desc in desc_list]
    # lines are joined with '\n', so the file has no trailing newline
    data = '\n'.join(lines)
    # `with` closes the file even when write() raises
    with open(filename, 'w') as file:
        file.write(data)

# Caption file: each line holds an image identifier followed by one description
filename = 'Flickr8k_text/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)
# parse descriptions into {image id: [description, ...]}
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean descriptions (modifies `descriptions` in place)
clean_descriptions(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save the cleaned descriptions to file; later cells reload them from descriptions.txt
save_descriptions(descriptions, 'descriptions.txt')

Loaded: 8092 
Vocabulary Size: 8763


Load image features and descriptions

In [None]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    """Load a pre-defined list of photo identifiers.

    Args:
        filename: path of a text file with one image file name per line.

    Returns:
        set of identifiers: every non-empty line cut at its first '.', with
        duplicates removed.
    """
    lines = load_doc(filename).split('\n')
    # empty lines are skipped; the extension is dropped from each identifier
    return {entry.split('.')[0] for entry in lines if len(entry) >= 1}

# load clean descriptions into memory
# Returns dictionary of image identifiers and corresponding descriptions
def load_clean_descriptions(filename, dataset):
    """Load the cleaned descriptions that belong to a given set of images.

    Every description is wrapped as 'startseq <description> endseq' so that
    the start and end of a caption are marked explicitly.

    Args:
        filename: path of the file written by save_descriptions
            ('<image id> <description>' per line).
        dataset: collection of image ids to keep.

    Returns:
        dict mapping each kept image id to its list of wrapped descriptions.
    """
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # a blank line (e.g. a trailing newline) has no id to read; skip it
        # instead of failing on tokens[0]
        if not tokens:
            continue
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # wrap description in the start/end markers
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # create the list on first use, then store
            descriptions.setdefault(image_id, list()).append(desc)
    return descriptions

# load photo features
def load_photo_features(filename, dataset):
    """Load the pickled photo features for a given set of images.

    Args:
        filename: path of the pickle file holding {image id: feature}.
        dataset: iterable of image ids to keep.

    Returns:
        dict with one entry per id in `dataset`.

    Raises:
        KeyError: if an id in `dataset` is missing from the pickled features.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # produced by this notebook.
    # `with` closes the handle once the features are loaded
    with open(filename, 'rb') as feature_file:
        all_features = load(feature_file)
    # filter features
    return {k: all_features[k] for k in dataset}

# load training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# preview a few entries only; echoing the whole dictionary floods the cell output
dict(list(train_descriptions.items())[:3])

Dataset: 6000
Descriptions: train=6000
Photos: train=6000


{'1000268201_693b08cb0e': ['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
  'startseq girl going into wooden building endseq',
  'startseq little girl climbing into wooden playhouse endseq',
  'startseq little girl climbing the stairs to her playhouse endseq',
  'startseq little girl in pink dress going into wooden cabin endseq'],
 '1001773457_577c3a7d70': ['startseq black dog and spotted dog are fighting endseq',
  'startseq black dog and tricolored dog playing with each other on the road endseq',
  'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
  'startseq two dogs of different breeds looking at each other on the road endseq',
  'startseq two dogs on pavement moving toward each other endseq'],
 '1002674143_1b742ab4b8': ['startseq little girl covered in paint sits in front of painted rainbow with her hands in bowl endseq',
  'startseq little girl is sitting in front of large painted rainbow endse

Tokenize descriptions and map to numeric values

In [None]:
from numpy import array
import tensorflow
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.models import load_model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten the description dictionary into a single list of strings.

    Args:
        descriptions: dict mapping image ids to lists of description strings.

    Returns:
        list with every description, grouped by image in dictionary order.
    """
    return [caption for captions in descriptions.values() for caption in captions]

# fit a tokenizer given caption descriptions
# Encode the text descriptions into numbers so that our model can understand this data
def create_tokenizer(descriptions):
    """Fit a Keras Tokenizer on all caption descriptions.

    Args:
        descriptions: dict mapping image ids to lists of description strings.

    Returns:
        The fitted Tokenizer, used to encode descriptions as integer sequences.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(to_lines(descriptions))
    return tokenizer

# Fit the tokenizer on the training descriptions and derive the vocabulary size.
# One is added to the number of known words; presumably this reserves index 0
# for padding (the Embedding in define_model uses mask_zero=True).
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# calculate the length of the description with the most words
def max_length(descriptions):
    """Return the word count of the longest description.

    Args:
        descriptions: dict mapping image ids to lists of description strings.

    Returns:
        The largest number of whitespace-separated words in any description.

    Raises:
        ValueError: if there are no descriptions at all.
    """
    word_counts = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_counts)

# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size=None):
    """Build the training samples for a single image.

    Every description is encoded with the tokenizer and expanded into one
    sample per word position: the model input is the photo feature plus the
    words seen so far (padded to `max_length`), the target is the next word,
    one-hot encoded.

    Args:
        tokenizer: fitted Keras Tokenizer.
        max_length: length every input sequence is padded to.
        desc_list: list of description strings for the image.
        photo: feature vector of the image.
        vocab_size: number of classes for the one-hot targets. Defaults to
            len(tokenizer.word_index) + 1, the value the notebook uses
            everywhere, so the function no longer depends on a global.

    Returns:
        Tuple (X1, X2, y) of arrays: photo features, padded input sequences
        and one-hot encoded next words, with one row per sample.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

# Merge VGG and LSTM models
# define the captioning model
# Contains embedding, dropout, dense and LSTM layers that process the image features and the caption sequence
def define_model(vocab_size, max_length):
    """Build and compile the captioning model.

    Two inputs are merged: a 4096-value photo feature vector and a padded
    word-index sequence of length `max_length`. The output is a softmax over
    `vocab_size` words, i.e. the prediction of the next word.

    Args:
        vocab_size: size of the vocabulary (embedding rows and output classes).
        max_length: length of the padded input sequences.

    Returns:
        The compiled Keras Model (categorical cross-entropy loss, adam optimizer).
    """
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)   # regularization, reduces overfitting
    fe2 = Dense(256, activation='relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2) # mask_zero to ignore the padded values
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2) # 256 memory units
    # decoder model
    decoder1 = add([fe2, se3])  # merge the two models
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize model; summary() prints by itself and returns None, so wrapping
    # it in print() only adds a stray "None" line
    model.summary()
    return model

Vocabulary Size: 7579


In [None]:
#Below code is used to progressively load the batch of data
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    Intended for model.fit_generator(): loading the samples progressively
    avoids holding every expanded sequence in memory at once.

    Args:
        descriptions: dict mapping image ids to lists of description strings.
        photos: dict mapping image ids to feature arrays; the first row of
            each array is used (assumes one row per image - confirm against
            extract_features).
        tokenizer: fitted Keras Tokenizer.
        max_length: length the input sequences are padded to.

    Yields:
        Tuple ([photo features, input sequences], next words) for one image.
    """
    # loop for ever over images
    while True:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            # Keras expects an (inputs, targets) tuple from a generator; a
            # list is rejected or misread by newer versions
            yield ([in_img, in_seq], out_word)

In [None]:
# load training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
# persist the tokenizer for the caption-generation cell; the context manager
# makes sure the pickle is flushed and the handle is closed
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
seq_max_length = max_length(train_descriptions)
print('Description Length: %d' % seq_max_length)

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7579
Description Length: 34


In [None]:
# make sure the checkpoint directory exists before the first model.save() call
os.makedirs('models', exist_ok=True)
# Pass the integer sequence length computed in the previous cell. `max_length`
# is the helper function, not a length; using it here only worked when an int
# of the same name had leaked in from a later cell.
model = define_model(vocab_size, seq_max_length)
# train the model, run epochs manually and save after each epoch
epochs = 20
# one generator step per training image
steps = len(train_descriptions)
for i in range(epochs):
    # create the data generator
    generator = data_generator(train_descriptions, train_features, tokenizer, seq_max_length)
    # fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=2)
    # save model
    model.save('models/model_' + str(i) + '.h5')

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 34, 256)      1940224     input_7[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 4096)         0           input_6[0][0]                    
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
 - 503s - loss: 4.6674
Epoch 1/1
 - 519s - loss: 3.8904
Epoch 1/1
 - 516s - loss: 3.6418
Epoch 1/1
 - 518s - loss: 3.4613
Epoch 1/1
 - 527s - loss: 3.3545
Epoch 1/1
 - 515s - loss: 3.2855
Epoch 1/1
 - 517s - loss: 3.2293
Epoch 1/1
 - 520s - loss: 3.1828
Epoch 1/1
 - 524s - loss: 3.1513
Epoch 1/1
 - 520s - loss: 3.1207
Epoch 1/1
 - 525s - loss: 3.0967
Epoch 1/1
 - 519s - loss: 3.0912
Epoch 1/1
 - 525s - loss: 3.0638
Epoch 1/1
 - 535s - loss: 3.0474
Epoch 1/1
 - 535s - loss: 3.0363
Epoch 1/1
 - 509s - loss: 3.0276
Epoch 1/1
 - 500s - loss: 3.0218
Epoch 1/1
 - 501s - loss: 3.0137
Epoch 1/1
 - 500s - loss: 3.0141
Epoch 1/1
 - 504s - loss: 3.0038


In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Map an integer index back to its word.

    Args:
        integer: word index to look up.
        tokenizer: object whose `word_index` maps words to integer indices.

    Returns:
        The first word whose index equals `integer`, or None if there is none.
    """
    matches = (token for token, idx in tokenizer.word_index.items() if idx == integer)
    return next(matches, None)

# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    """Generate a description for an image, one word at a time.

    Starting from 'startseq', the text so far is encoded, padded and fed to
    the model together with the photo feature; the most probable next word is
    appended. Generation stops after `max_length` words, when 'endseq' is
    produced, or when the predicted index has no word.

    Args:
        model: trained captioning model.
        tokenizer: fitted tokenizer used during training.
        photo: photo feature passed to the model as its first input.
        max_length: padded sequence length and maximum number of generated words.

    Returns:
        The generated text, including the leading 'startseq' (and the trailing
        'endseq' when it was predicted).
    """
    # seed the generation process
    caption = 'startseq'
    for _ in range(max_length):
        # integer encode the text generated so far and pad it
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        # predict the next word and turn the probabilities into a word
        probabilities = model.predict([photo, padded], verbose=0)
        next_word = word_for_id(argmax(probabilities), tokenizer)
        # stop if we cannot map the word
        if next_word is None:
            break
        # append as input for generating the next word
        caption = caption + ' ' + next_word
        # stop if we predict the end of the sequence
        if next_word == 'endseq':
            break
    return caption


In [None]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, seq_max_length):
    """Evaluate the model by printing corpus BLEU-1 to BLEU-4 scores.

    A description is generated for every image and compared against all of
    that image's reference descriptions.

    Args:
        model: trained captioning model.
        descriptions: dict mapping image ids to lists of reference descriptions.
        photos: dict mapping image ids to photo features.
        tokenizer: fitted tokenizer used during training.
        seq_max_length: maximum sequence length used during training.
    """
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], seq_max_length)
        # store actual and predicted as token lists
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # cumulative BLEU-3 weights must sum to 1, like the other three rows;
    # (0.3, 0.3, 0.3, 0) sums to 0.9 and inflates the score
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
from keras.models import load_model
from numpy import argmax
from nltk.translate.bleu_score import corpus_bleu
# prepare training set

# load training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# prepare tokenizer (fitted on the training descriptions only)
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
seq_max_length = max_length(train_descriptions)
print('Description Length: %d' % seq_max_length)

# prepare test set

# load test set
filename = 'Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

# load the checkpoint saved after the last training epoch (model_19); in the
# training log above it is the epoch with the lowest loss
filename = 'models/model_19.h5'
model = load_model(filename)
# evaluate model: prints BLEU-1..4 for the test set
evaluate_model(model, test_descriptions, test_features, tokenizer, seq_max_length)

Dataset: 6000
Descriptions: train=6000
Vocabulary Size: 7579
Description Length: 34
Dataset: 1000
Descriptions: test=1000
Photos: test=1000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


BLEU-1: 0.509170
BLEU-2: 0.268791
BLEU-3: 0.183181
BLEU-4: 0.083391


In [None]:
#Generate Captions for a Fresh Image

from pickle import load
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.models import load_model

# extract features from each photo in the directory
def extract_features(filename):
    """Extract the VGG16 feature for a single photo.

    NOTE: this definition replaces the earlier extract_features(directory)
    once the cell has run; that one takes a folder, this one a single file.

    Args:
        filename: path of the image file.

    Returns:
        The array returned by model.predict for the image (one row; expected
        to be 4096 values wide for the stock VGG16, as define_model requires).
    """
    # load the model
    model = VGG16()
    # Re-structure the model: use the second-to-last layer as output instead of
    # calling model.layers.pop(). On Keras versions where `layers` returns a
    # copy, pop() is a no-op and the softmax predictions would stay the output.
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    # load the photo
    image = load_img(filename, target_size=(224, 224))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # add a leading batch dimension of size 1 for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get features
    feature = model.predict(image, verbose=0)
    return feature

# load the tokenizer saved during training (pickle: only load files produced by
# this notebook); the context manager closes the handle after loading
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
# pre-define the max sequence length (from training); stored under its own name
# so that the max_length() helper defined earlier is not overwritten by an int
caption_max_length = 34
# load the model
model = load_model('models/model_19.h5')
# load and prepare the photograph
photo = extract_features('Sample_Image.jpg')
# generate description
description = generate_desc(model, tokenizer, photo, caption_max_length)
print(description)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


startseq man in red shirt is riding bike on the street endseq


In [None]:
# Remove the 'startseq' and 'endseq' markers from the generated description
query = description
# marker tokens to drop (compared case-insensitively below)
stopwords = ['startseq','endseq']
querywords = query.split()

# keep every word that is not one of the markers, then rebuild the sentence
resultwords  = [word for word in querywords if word.lower() not in stopwords]
result = ' '.join(resultwords)

print(result)

man in red shirt is riding bike on the street


In [None]:
import shutil
# Pack the contents of the 'models' directory into models.zip
shutil.make_archive('models', 'zip', 'models')

'/content/models.zip'

In [None]:
# Pin httplib2 before uploading - presumably a workaround for an upload
# incompatibility with newer httplib2 releases; TODO confirm it is still needed.
!pip install httplib2==0.15.0
# Create a Drive file titled models.zip and upload the local archive through the
# `drive` client built in the authentication cell.
upload = drive.CreateFile({'title': 'models.zip'})
upload.SetContentFile('/content/models.zip')
upload.Upload()

