# Image Caption Generation using VGG
MD Muhaimin Rahman
contact: sezan92[at]gmail[dot]com

In this project, I work on caption generation for images from the Flickr8k dataset. I took extensive help from Jason Brownlee's blog [article](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/) on the same dataset, but I thought some code blocks were unnecessarily complex, so I changed them for my project. The architecture is mainly taken from Google's [paper](https://arxiv.org/abs/1411.4555).

### Data Organization

To make sense of this code, we need to get an idea of how the dataset is organized. I have used the Flickr8k dataset, which I cannot redistribute. You have to fill out this [form](https://forms.illinois.edu/sec/1713398) and they will give you the dataset. You have to keep the folders ```Flicker8k_Dataset``` and ```Flickr_Text``` inside the ```dataset``` folder.

The ```Flicker8k_Dataset``` folder has all the images (train, test and validation together). The ```Flickr_Text``` folder has several ```txt``` files. We will need four of them:
* Flickr8k.token.txt
* Flickr_8k.trainImages.txt
* Flickr_8k.devImages.txt
* Flickr_8k.testImages.txt

```Flickr8k.token.txt``` contains 5 captions for every image name. ```Flickr_8k.trainImages.txt``` contains the names of the train images; ```Flickr_8k.devImages.txt``` and ```Flickr_8k.testImages.txt``` contain the validation and test image names respectively.

The job of data preprocessing here is as follows:
* Extract features from every image and save them in a pickle file
* Extract captions for every image from ```Flickr8k.token.txt``` file and save them as dictionary
* Separate image names with their captions for every dataset


#### Importing Packages

In [None]:
import keras
import pickle
from keras.applications.vgg16 import VGG16
from keras.applications import Xception,InceptionV3,InceptionResNetV2
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
import numpy as np
import pickle
import os

Function to load the VGG16 Model

In [None]:
def prev_model():
    """Build the VGG16-based feature extractor.

    Loads the stock VGG16 network and re-wires it so that the output is
    taken from the layer just before the final classification layer.

    Returns:
        A Keras ``Model`` that shares VGG16's inputs and outputs the
        activations of VGG16's second-to-last layer.
    """
    base = VGG16()
    # Pick the penultimate layer by index instead of calling
    # ``base.layers.pop()``: on Keras versions where ``layers`` returns a
    # copy of the list, pop() has no effect and ``layers[-1]`` would still be
    # the classifier, silently producing class scores instead of features.
    model = Model(inputs=base.inputs, outputs=base.layers[-2].output)
    model.summary()
    return model

### Data Preprocessing

Directory of the Flicker8k_Dataset

In [None]:
# Folder that holds the Flickr8k image files fed to extract_features().
directory = 'dataset/Flicker8k_Dataset'

Function for extracting Features. 
* if the features already available it will return the extracted features file
* else Extract features from every image
* Save the Extracted features of All images in one pickle file

In [None]:
def extract_features(directory,model_name):
    """Return a dict mapping each image file name to its extracted feature.

    If ``dataset/features_<model_name>.pkl`` already exists it is loaded and
    returned as-is. Otherwise every file in ``directory`` is pushed through
    the network built by ``prev_model()`` and the resulting dict is pickled
    to that path before being returned.

    Args:
        directory: Folder containing the image files.
        model_name: Label used only to build the cache file name; the
            network itself always comes from ``prev_model()``.

    Returns:
        dict of ``{image_name: feature}`` where ``feature`` is the array
        returned by ``model.predict`` for a batch holding that one image.
    """
    cache_path = 'dataset/features_%s.pkl'%(model_name)
    if os.path.exists(cache_path):
        print("Features file already exists")
        # Pickle streams are binary: text mode breaks on Python 3.
        with open(cache_path,'rb') as cache_file:
            return pickle.load(cache_file)

    model = prev_model()
    features = dict()
    for image_name in os.listdir(directory):
        image = load_img(directory+'/'+image_name,target_size=(224,224))
        image = img_to_array(image)
        image = preprocess_input(image)
        # Add the leading batch axis expected by model.predict.
        image = image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
        features[image_name] = model.predict(image)

        print('%s done!'%(image_name))
    with open(cache_path,'wb') as cache_file:
        pickle.dump(features,cache_file)
    return features

In [None]:
# Compute the image features, or load them from the pickle cache labelled 'VGG16'.
features = extract_features(directory,'VGG16')


A single Feature vector dimension. Should be (4096,)

Function for Extracting captions from caption text files

In [None]:
def extract_captions(filename):
    """Return a dict mapping each image name to its lower-cased captions.

    If ``dataset/data_with_captions.pkl`` exists it is loaded and returned.
    Otherwise ``filename`` is parsed: each non-empty line is expected to look
    like ``<image_name>#<n>\\t<caption>``. The parsed dict is pickled to the
    cache path before being returned.

    A cache written by an earlier version of this function lacks the first
    caption of every image; delete it to rebuild with all captions.

    Args:
        filename: Path of the caption token file.

    Returns:
        dict of ``{image_name: [caption, ...]}`` in file order.

    Raises:
        IndexError: if a non-empty line contains no tab separator.
    """
    cache_path = 'dataset/data_with_captions.pkl'
    if os.path.exists(cache_path):
        print('Data with Captions file already exists')
        # Pickle streams are binary: text mode breaks on Python 3.
        with open(cache_path,'rb') as cache_file:
            return pickle.load(cache_file)

    with open(filename) as token_file:
        lines = token_file.read().split('\n')

    dataset = dict()
    for line in lines:
        if len(line)<1:
            # Skip blank lines instead of stopping, so a stray empty line in
            # the middle of the file does not discard everything after it.
            continue
        fields = line.split('\t')
        caption = fields[1]
        image_name = fields[0].split('#')[0]
        # Always append: creating the list without appending used to drop
        # the first caption seen for every image.
        dataset.setdefault(image_name,[]).append(caption.lower())
        print(image_name+" done!")
    with open(cache_path,'wb') as cache_file:
        pickle.dump(dataset,cache_file)

    return dataset


In [None]:
# Caption token file handed to extract_captions().
filename= 'dataset/Flickr_Text/Flickr8k.token.txt'

# Image name -> list of captions (read from the pickle cache when it exists).
data_with_captions= extract_captions(filename)


In [None]:
# Text files listing the image names that belong to each split.
train_filename = 'dataset/Flickr_Text/Flickr_8k.trainImages.txt' 
dev_filename='dataset/Flickr_Text/Flickr_8k.devImages.txt'
test_filename = 'dataset/Flickr_Text/Flickr_8k.testImages.txt'



Function for loading dataset. That is, making list of names of each imageset folder.

In [None]:
def load_dataset(filename):
    """Return the non-empty lines of ``filename`` as a list of image names.

    Args:
        filename: Path of a text file with one image name per line.

    Returns:
        list of image names in file order, with empty lines removed.
    """
    with open(filename) as names_file:
        lines = names_file.read().split('\n')
    # Build a new list rather than removing while iterating: in-place
    # removal skips elements and leaves empties when blank lines are adjacent.
    return [image_name for image_name in lines if len(image_name)>=1]


In [None]:
# Lists of image names for the train, dev and test splits.
train = load_dataset(train_filename)
dev= load_dataset(dev_filename)
test=load_dataset(test_filename)


Getting features for every dataset. That is, to extract features for images of every dataset

In [None]:
def get_features(dataset, features_path=None):
    """Return the cached features restricted to the images in ``dataset``.

    Args:
        dataset: Iterable of image names.
        features_path: Pickle file holding ``{image_name: feature}``. When
            omitted, ``dataset/features_VGG16.pkl`` (the file written by
            ``extract_features``) is used, falling back to
            ``features_VGG16.pkl`` in the working directory if only that
            one exists.

    Returns:
        dict of ``{image_name: feature}`` for every name in ``dataset``.

    Raises:
        KeyError: if a name in ``dataset`` has no cached feature.
    """
    if features_path is None:
        features_path = 'dataset/features_VGG16.pkl'
        legacy_path = 'features_VGG16.pkl'
        if not os.path.exists(features_path) and os.path.exists(legacy_path):
            features_path = legacy_path
    # Pickle streams are binary: text mode breaks on Python 3.
    with open(features_path,'rb') as features_file:
        features = pickle.load(features_file)
    return {image_name: features[image_name] for image_name in dataset}

In [None]:
# Per-split dicts of image name -> feature.
train_features_set = get_features(train)
test_features_set = get_features(test)
dev_features_set = get_features(dev)

## Now Stop! Take a deep breath, and start again!

The previous steps were just the beginning. What comes next is a bit tougher. Please go slowly, and try to understand how it works.

The architecture will work in a different way. First, have a look at the following image

![SkateBoard](Caption0.jpg)

What the model does is take the image feature and a trigger word, $start$. Then it predicts the next word, which in our case is $a$. Then it merges the feature with the trigger word and the first predicted word, and predicts the second word, which in our case is $skateboarder$. It continues to do so until it reaches the final trigger, which in our case is $end$.

Please have a look at the following flow chart

![FlowChart](FlowChart2.jpeg)

### So we need to process the dataset, again!

So to train the Model , we need data like this
* Feature + 'start' , Prediction 'a'
* Feature + 'start a' , Prediction 'skateboarder'
* Feature + 'start a skateboarder' , Prediction 'does'
* Feature + 'start a skateboarder does' , Prediction 'a'
* Feature + 'start a skateboarder does a', Prediction 'trick'
* Feature + 'start a skateboarder does a trick', Prediction 'on'
* Feature + 'start a skateboarder does a trick on' , Prediction 'a'
* Feature + 'start a skateboarder does a trick on a ', Prediction 'ramp'
* Feature + 'start a skateboarder does a trick on a ramp' , Prediction 'end'

So, we need to
* Add $start$ and $end$ with every caption .
* Split the captions 
* Increase the features according to possible combinations of "Feature+ caption"
* Tokenize the captions
* Calculate maximum length of all captions
* Pad the captions that are shorter than the maximum length with zeros


### Back to Code

Adding $start$ and $end$

In [None]:
# Wrap every caption in the 'start' / 'end.' trigger tokens, in place.
# Positions come from enumerate() rather than desc_list.index(): index()
# returns the first equal element, which may be an already-wrapped entry,
# and costs a linear scan per caption.
# NOTE: not idempotent; running this cell twice wraps the captions twice.
for desc_list in data_with_captions.values():
    for position, caption in enumerate(desc_list):
        desc_list[position] = 'start '+caption+' end.'

Function for loading descriptions for each dataset

In [None]:
def load_desc(data):
    """Look up the captions of every image name in ``data``.

    Reads from the module-level ``data_with_captions`` dict.

    Args:
        data: Iterable of image names.

    Returns:
        dict of ``{image_name: captions}``; the caption lists are the same
        objects stored in ``data_with_captions``, not copies.

    Raises:
        KeyError: if a name is missing from ``data_with_captions``.
    """
    return {name: data_with_captions[name] for name in data}

In [None]:
# Per-split dicts of image name -> list of captions.
train_desc = load_desc(train)
dev_desc = load_desc(dev)
test_desc = load_desc(test)

Getting the vocabulary

In [None]:
# Flatten every caption of every image into one long string, then split it
# on whitespace to get the full list of tokens.
texts = ' '.join(' '.join(captions) for captions in data_with_captions.values())
texts_list = texts.split()
# The vocabulary is the sorted set of distinct tokens; the size is one
# larger than the number of tokens.
vocab = sorted(set(texts_list))
vocab_size = len(vocab) + 1
print('Vocabulary Size %d'%vocab_size)

Saving Vocabulary for future use

In [None]:
# Persist the vocabulary. Pickle streams are binary, so the file is opened
# with 'wb' (text mode fails on Python 3), and the handle is closed promptly.
with open('vocabulary.pkl','wb') as vocab_file:
    pickle.dump(vocab,vocab_file)

Tokenizing the words. You can use the tokenizer class from ```keras```, but it didn't work very well in my case; I don't know why.

In [None]:
# Hand-made tokenizer: index -> word and word -> index over the sorted vocab.
# NOTE(review): ids start at 0, which is also the default fill value of
# pad_sequences, so vocab[0] is indistinguishable from padding. Changing the
# numbering requires retraining and the same change wherever the mapping is
# rebuilt for inference.
i2w = {index: word for index, word in enumerate(vocab)}
w2i = {word: index for index, word in enumerate(vocab)}

Maximum length

In [None]:
# Longest training caption, counted in whitespace-separated tokens.
# A single flat max() is required: max() over a list of lists compares the
# lists lexicographically and does not yield the global maximum.
# Inference code that pads input sequences must use this same length.
max_length = max(len(d.split()) for ls in train_desc.values() for d in ls)

print("Maximum Length %d"%(max_length))


Function for Encoding the Captions according to self made tokenizer

In [None]:
def encode_desc(description):
    """Encode captions as word ids and expand them into next-word samples.

    Uses the module-level ``w2i`` mapping and ``max_length``.

    Args:
        description: dict mapping image name to a list of caption strings.

    Returns:
        A tuple ``(encoded, encoded_list_extend, out_list)``:

        * ``encoded`` maps each image name to its captions, each caption
          being the list of ``w2i`` ids of its whitespace-separated words.
        * ``encoded_list_extend`` has one entry per caption prefix
          ``ids[:i]`` (``1 <= i < len(ids)``): the prefix pre-padded to
          ``max_length``, stored as a one-row nested list.
        * ``out_list`` holds, in the same order, the id that follows each
          prefix (``ids[i]``).

    Raises:
        KeyError: if a caption contains a word that is not in ``w2i``.
    """
    encoded = dict()
    encoded_list_extend = []
    out_list = []
    for key in description.keys():
        # Encode each caption once and reuse it for both outputs.
        caps_encoded = [[w2i[word] for word in cap.split()] for cap in description[key]]
        encoded[key] = caps_encoded
        for ids in caps_encoded:
            if len(ids) < 2:
                # No (prefix, next word) pair can be formed.
                continue
            prefixes = [ids[:i] for i in range(1, len(ids))]
            # Pad all prefixes of this caption in a single call.
            padded = pad_sequences(prefixes, maxlen=max_length, padding='pre')
            # Keep the one-row nesting so np.array() of the result is 3-D.
            encoded_list_extend.extend([row] for row in padded.tolist())
            out_list.extend(ids[1:])
    return encoded, encoded_list_extend, out_list


In [None]:
# Encode the captions of each split.
train_desc_encoded, train_desc_encoded_list, train_out = encode_desc(train_desc)
dev_desc_encoded, dev_desc_encoded_list, dev_out = encode_desc(dev_desc)
test_desc_encoded, test_desc_encoded_list, test_out = encode_desc(test_desc)

# Bare expressions kept from the notebook (they only display a value when
# they are the last statement of a cell).
train_desc_encoded

train_desc_encoded_list

# Stack the padded prefixes into arrays; each entry is a one-row nested
# list, so the arrays come out 3-D.
train_desc_encoded_np = np.array(train_desc_encoded_list)
dev_desc_encoded_np = np.array(dev_desc_encoded_list)
test_desc_encoded_np = np.array(test_desc_encoded_list)
print("Training array shape %s" % (train_desc_encoded_np.shape,))
print("Dev array shape %s" % (dev_desc_encoded_np.shape,))
print("Test array shape %s" % (test_desc_encoded_np.shape,))

# Collapse to 2-D, keeping the size of the third axis as the row length.
train_desc_encoded_np = train_desc_encoded_np.reshape((-1, train_desc_encoded_np.shape[2]))
dev_desc_encoded_np = dev_desc_encoded_np.reshape((-1, dev_desc_encoded_np.shape[2]))
test_desc_encoded_np = test_desc_encoded_np.reshape((-1, test_desc_encoded_np.shape[2]))
print("Training array shape %s" % (train_desc_encoded_np.shape,))
print("Dev array shape %s" % (dev_desc_encoded_np.shape,))
print("Test array shape %s" % (test_desc_encoded_np.shape,))

# Next-word targets for every prefix.
train_out_np = np.array(train_out)
dev_out_np = np.array(dev_out)
test_out_np = np.array(test_out)
print("Training Output array shape %s" % (train_out_np.shape,))
print("Dev Output array shape %s" % (dev_out_np.shape,))
print("Test Output array shape %s" % (test_out_np.shape,))

Function for preparing features , again

In [None]:
def prepare_features(features_set,description_encoded):
    """Repeat each image feature once per caption-prefix sample.

    For every key of ``features_set`` and every encoded caption stored for
    that key, ``features_set[key][0]`` is emitted ``len(caption) - 1`` times.

    Args:
        features_set: dict of ``{image_name: feature}``; only element ``[0]``
            of each feature is used.
        description_encoded: dict of ``{image_name: [encoded_caption, ...]}``.

    Returns:
        list of repeated features, in ``features_set`` iteration order.

    Raises:
        KeyError: if a key of ``features_set`` is missing from
            ``description_encoded``.
    """
    x1 = []
    for key in features_set:
        for desc in description_encoded[key]:
            # One copy per position 1..len(desc)-1 of the caption.
            x1.extend(features_set[key][0] for _ in range(1, len(desc)))
    return x1

Plotting model

In [None]:
# Write a diagram of the model, including layer shapes, to model_new.png.
# NOTE(review): `full_model` is not defined in the visible part of this file;
# presumably the model-building cell sits elsewhere. Confirm before running.
plot_model(full_model, to_file='model_new.png', show_shapes=True)

It should give the following image

![Model](model.png)

Training the model

In [None]:
batch_size=1024
epochs = 20

# The dev arrays never change, so build them once instead of once per batch.
X1test = np.array(dev_x1)
X2test = dev_desc_encoded_np
ytest = dev_out_np

for epoch in range(epochs):
    # Feed the training set in slices of `batch_size` samples.
    for i in range(0,len(train_x1),batch_size):
        X1train = np.array(train_x1[i:i+batch_size])
        X2train = train_desc_encoded_np[i:i+batch_size]
        ytrain = train_out_np[i:i+batch_size]
        # No validation_data here: with verbose=0 and the History object
        # discarded, validating the whole dev set after every mini-batch
        # only cost time. Validation loss is computed once per epoch below.
        full_model.fit([X1train, X2train], ytrain, verbose=0,batch_size=batch_size)
    # Train loss is measured on the last mini-batch of the epoch only.
    # NOTE(review): the %f formatting assumes evaluate() returns a single
    # scalar, i.e. the model was compiled without extra metrics. Confirm.
    train_loss =full_model.evaluate(x=[X1train, X2train], y=ytrain,verbose=0)
    Val_loss = full_model.evaluate(x=[X1test, X2test], y=ytest,verbose=0)
    print("Epoch %d , Train Loss %f and Val Loss %f"%(epoch,train_loss,Val_loss))

# Final evaluation on the dev set (the value is displayed only in a notebook).
full_model.evaluate(x=[X1test,X2test],y=ytest)

# Serialize the architecture to JSON.
model_json = full_model.to_json()
with open("/output/Caption_model_VGG16.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
full_model.save_weights("/output/Caption_model_VGG16.h5")
print("Saved model to disk")


## Test The model

In [None]:
def prev_model():
    """Build the VGG16-based feature extractor (no summary is printed).

    Returns:
        A Keras ``Model`` that shares VGG16's inputs and outputs the
        activations of VGG16's second-to-last layer.
    """
    base = VGG16()
    # Pick the penultimate layer by index instead of calling
    # ``base.layers.pop()``: on Keras versions where ``layers`` returns a
    # copy of the list, pop() has no effect and ``layers[-1]`` would still be
    # the classifier, silently producing class scores instead of features.
    model = Model(inputs=base.inputs, outputs=base.layers[-2].output)
    return model

# Reload the vocabulary. Pickle streams are binary, so read with 'rb'
# (text mode fails on Python 3).
with open('vocabulary.pkl','rb') as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_size=len(vocab)+1
print('Vocabulary Size %d'%vocab_size)
# Rebuild the index <-> word mappings from the saved vocabulary order.
i2w = dict((i,c)for i,c in enumerate(vocab))
w2i = dict((c,i)for i,c in enumerate(vocab))

# NOTE(review): hard-coded; it has to equal the sequence length the loaded
# model was trained with. Confirm against the training run.
max_length =35
print("Maximum Length %d"%(max_length))

from keras.models import model_from_json
# Rebuild the architecture from JSON, then attach the trained weights.
with open('output/Caption_model_VGG16.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("output/Caption_model_VGG16.h5")
print("Loaded model from disk")

# Feature extractor, also bound as the default `model` argument below.
model = prev_model()
def get_img_feature(filename,model=model):
    """Return the feature predicted by ``model`` for one image file.

    Args:
        filename: Path of the image; it is loaded at 224x224.
        model: Network used for prediction. Defaults to the module-level
            ``model`` as it was when this function was defined.

    Returns:
        The output of ``model.predict`` for a batch holding this one image.
    """
    pixels = img_to_array(load_img(filename,target_size=(224,224)))
    pixels = preprocess_input(pixels)
    # Add the leading batch axis expected by model.predict.
    batch = pixels.reshape((1,pixels.shape[0],pixels.shape[1],pixels.shape[2]))
    return model.predict(batch)

# `plt` is used below but is not imported anywhere in the visible file.
import matplotlib.pyplot as plt

test_dir ='Test Images'
image_names =os.listdir(test_dir)

# Caption every image in `test_dir` and save it with the caption as title.
for image_index, image_name in enumerate(image_names):

    image_name_full = os.path.join(test_dir, image_name)

    features = get_img_feature(image_name_full)
    # Seed the decoder with the 'start' trigger, padded to the model length.
    in_text ='start'
    in_text_encode = [w2i[in_text]]

    seq = pad_sequences([in_text_encode],maxlen=max_length)
    output =[]
    print(image_name)
    # Greedy decoding: take the most probable word at each step.
    for _ in range(max_length):
        yoh = loaded_model.predict(x=[features,seq])
        word_indice = int(np.argmax(yoh))
        word = i2w[word_indice]
        if word=='end.':
            # Stop at the end trigger. The input would not change after it,
            # so further steps would only repeat the same prediction.
            break
        output.append(word)
        # Slide the window: drop the oldest id and append the new word.
        seq = np.array([seq.tolist()[0][1:] + [word_indice]])

    out_text =' '.join(output)
    im = plt.imread(image_name_full)
    plt.figure()
    plt.imshow(im)
    plt.title(out_text)
    plt.savefig('Caption%d.jpg'%(image_index))
    # Release the figure so memory does not grow with the number of images.
    plt.close()