In [1]:
# IMPORT MODULES
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [2]:
# LOAD VGG MODEL
model=VGG16()
# Restructure the model: keep the original inputs but expose the output of the
# second-to-last layer, so predict() returns that layer's activations
model=Model(inputs=model.inputs,outputs=model.layers[-2].output)
# summarize; summary() prints the table itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [3]:
# Extract a feature array for every file in the image folder
# features maps image id (file name up to the first '.') -> model output
features={}
directory=os.path.join('Images')
for img_name in tqdm(os.listdir(directory)):
    # read the image from disk, resized to the 224x224 input size of the model
    image=load_img(f"{directory}/{img_name}",target_size=(224,224))
    # pixels -> numpy array with a leading batch axis of size 1
    image=np.expand_dims(img_to_array(image),axis=0)
    # apply the VGG preprocessing, then run the network on the single image
    feature=model.predict(preprocess_input(image),verbose=0)
    # key the result by image id
    image_id=img_name.split('.')[0]
    features[image_id]=feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [11]:
#store features in pickle
#the with-block closes the file, so the data is fully written before any later cell reads it back
with open(os.path.join('features.pkl'),'wb') as f:
    pickle.dump(features,f)

In [12]:
#read the feature dictionary back from features.pkl
#NOTE: pickle.load can execute arbitrary code, so only load a file you produced yourself
features_file=os.path.join('features.pkl')
with open(features_file,mode='rb') as fh:
    features=pickle.load(fh)

In [13]:
# Load the captions data
captions_path=os.path.join('captions.txt')
with open(captions_path,'r') as captions_file:
    # discard the first line (presumably a header row) before reading the rest
    next(captions_file)
    captions_doc=captions_file.read()

In [14]:
#Create mapping of image id -> list of its captions
mapping={}
for line in tqdm(captions_doc.split('\n')):
    #ignore lines of two characters or fewer (e.g. the empty string after the final newline)
    if len(line)<=2:
        continue
    #first comma-separated field is the image file name, everything after it is the caption
    file_name,*caption_parts=line.split(',')
    image_id=file_name.split('.')[0]
    #commas inside the caption text turn into spaces when the pieces are re-joined
    mapping.setdefault(image_id,[]).append(" ".join(caption_parts))

  0%|          | 0/40456 [00:00<?, ?it/s]

In [15]:
#number of distinct image ids that received at least one caption
len(mapping)

8091

In [19]:
import re
import string
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of digits and punctuation, reduced to
    the words longer than one character, and wrapped in '<startseq>' and
    '<endseq>' tags.

    Args:
        mapping: dict of key -> list of caption strings. The lists are
            modified in place.

    Returns:
        None.

    Note:
        Not idempotent: a second call strips the angle brackets from the
        existing tags and wraps the caption in another pair of tags.
    """
    # Compiled once instead of once per caption. re.escape is required because
    # string.punctuation contains characters that are special inside a
    # character class; without it the backslash was never removed.
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')
    for key, captions in mapping.items():
        for i, caption in enumerate(captions):
            # convert to lowercase
            caption = caption.lower()
            # replace every digit with a space; the previous class '[0 - 9]'
            # only matched '0', '9' and the space character
            caption = re.sub(r'[0-9]', ' ', caption)
            # remove punctuation characters
            caption = punctuation.sub('', caption)
            # collapse runs of spaces
            caption = re.sub(' +', ' ', caption)
            # drop one-character words and add the start and end tags
            words = [word for word in caption.split() if len(word) > 1]
            captions[i] = '<startseq> ' + " ".join(words) + ' <endseq>'

In [20]:
#preprocess the text
#clean() rewrites the caption lists inside mapping in place, so re-running this cell
#processes the already-tagged captions a second time
clean(mapping)

In [21]:
#spot-check the captions currently stored for one image id
mapping['1002674143_1b742ab4b8']

['<startseq> little girl covered in paint sits in front of painted rainbow with her hands in bowl <endseq>',
 '<startseq> little girl is sitting in front of large painted rainbow <endseq>',
 '<startseq> small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it <endseq>',
 '<startseq> there is girl with pigtails sitting in front of rainbow painting <endseq>',
 '<startseq> young girl with pigtails painting outside in the grass <endseq>']

In [22]:
#flatten the per-image caption lists into a single list, in mapping order
all_captions=[text for image_captions in mapping.values() for text in image_captions]

In [23]:
#preview the first ten captions
all_captions[:10]

['<startseq> child in pink dress is climbing up set of stairs in an entry way <endseq>',
 '<startseq> girl going into wooden building <endseq>',
 '<startseq> little girl climbing into wooden playhouse <endseq>',
 '<startseq> little girl climbing the stairs to her playhouse <endseq>',
 '<startseq> little girl in pink dress going into wooden cabin <endseq>',
 '<startseq> black dog and spotted dog are fighting <endseq>',
 '<startseq> black dog and tricolored dog playing with each other on the road <endseq>',
 '<startseq> black dog and white dog with brown spots are staring at each other in the street <endseq>',
 '<startseq> two dogs of different breeds looking at each other on the road <endseq>',
 '<startseq> two dogs on pavement moving toward each other <endseq>']

In [24]:
#Tokenize the text
#NOTE(review): Tokenizer() is created with its default settings; the Keras default
#`filters` include '<' and '>', so the '<startseq>'/'<endseq>' tags are presumably
#indexed as 'startseq'/'endseq' -- confirm against tokenizer.word_index
tokenizer=Tokenizer()
tokenizer.fit_on_texts(all_captions)
#one more than the number of distinct words; Keras word indices start at 1, leaving 0 unused by any word
vocab_size=len(tokenizer.word_index)+1
print(vocab_size)

8797


In [25]:
#longest caption, measured in whitespace-separated tokens (start and end tags included)
caption_lengths=[len(text.split()) for text in all_captions]
max_length=max(caption_lengths)
max_length

34

In [26]:
# Train Test Split: the first 90% of the image ids (in mapping order) train, the rest test
image_ids=list(mapping)
split=int(len(image_ids)*0.90)
train,test=image_ids[:split],image_ids[split:]

In [27]:
#Data Generator
def data_generator(data_keys,mapping,features,tokenizer,max_length,vocab_size,batch_size):
    """Endlessly yield training batches built from captions and image features.

    Every caption of every key is expanded into (prefix -> next word) samples:
    for a token sequence ``seq``, each ``i`` in ``1..len(seq)-1`` produces the
    input ``seq[:i]`` and the target ``seq[i]``.

    Args:
        data_keys: sized, re-iterable collection of keys into ``mapping`` and ``features``.
        mapping: dict of key -> list of caption strings.
        features: dict of key -> array whose element ``[0]`` is used as the image input.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: length every input prefix is padded to.
        vocab_size: number of classes of the one-hot encoded target.
        batch_size: number of keys (not samples) that make up one yielded batch.

    Yields:
        ``[X1, X2], y`` as numpy arrays: image inputs, padded prefixes and
        one-hot next-word targets for ``batch_size`` keys.

    Raises:
        ValueError: if ``data_keys`` is empty or ``batch_size`` is less than 1.
            As this is a generator, the error surfaces on the first ``next()``.
    """
    # Either condition would make the endless loop below spin without ever yielding
    if len(data_keys)==0:
        raise ValueError("data_keys must not be empty")
    if batch_size<1:
        raise ValueError("batch_size must be at least 1")
    X1,X2,y=list(),list(),list()
    n=0
    while True:
        for key in data_keys:
            n+=1
            for caption in mapping[key]:
                seq=tokenizer.texts_to_sequences([caption])[0]
                #split the sequence in x,y pairs
                # x is the input and y is the output
                for i in range(1,len(seq)):
                    #split into input and output pairs
                    in_seq,out_seq=seq[:i],seq[i]
                    #pad input sequence
                    in_seq=pad_sequences([in_seq],maxlen=max_length)[0]
                    #encode output sequence
                    out_seq=to_categorical([out_seq],num_classes=vocab_size)[0]

                    #store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            #keys left over at the end of a pass stay buffered and are completed
            #by the first keys of the next pass
            if n==batch_size:
                yield [np.array(X1),np.array(X2)],np.array(y)
                X1,X2,y=list(),list(),list()
                n=0

In [None]:
# Model Creation

In [28]:
#Encoder model
#Image feature layers
#input: one 4096-long vector per sample (assumed to match the VGG16 features extracted earlier -- TODO confirm)
inputs1=Input(shape=(4096,))
fe1=Dropout(0.4)(inputs1)
#project the image branch to 256 units, the same width as the LSTM output it is added to below
fe2=Dense(256,activation='relu')(fe1)
#sequence feature layers
#input: a caption prefix of max_length token ids
inputs2=Input(shape=(max_length,))
#mask_zero=True makes the embedding treat index 0 as padding to be masked
se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
se2=Dropout(0.4)(se1)
se3=LSTM(256)(se2)

#Decoder model
#combine the image branch and the text branch with add()
decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
#softmax over vocab_size classes: one score per candidate next word
outputs=Dense(vocab_size,activation='softmax')(decoder2)
#NOTE: this rebinds `model`, replacing the VGG16 feature extractor created earlier in the notebook
model=Model(inputs=[inputs1,inputs2],outputs=outputs)
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
#train the model
epochs=15
batch_size=64
#number of batches per epoch; the floor division drops any remainder of the training keys
steps=len(train)//batch_size
for epoch in range(epochs):
    #a new generator per epoch, so each epoch starts again from the first training key
    batches=data_generator(train,mapping,features,tokenizer,max_length,vocab_size,batch_size)
    #train on exactly `steps` batches, i.e. one epoch
    model.fit(batches,steps_per_epoch=steps,epochs=1,verbose=1)



In [None]:
#save the model (the object currently bound to `model`) to model.h5 in the working directory
model.save('model.h5')

In [None]:
#Generate Captions for the image
def idx_to_word(integer,tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``, or None when no word has that index."""
    matches=(word for word,index in tokenizer.word_index.items() if index==integer)
    return next(matches,None)
    

In [None]:
#generate caption for an image
def predict_caption(model,image,tokenizer,max_length):
    """Greedily decode a caption for one image.

    Starting from '<startseq>', the model is asked for the next word given the
    image input and the text so far; the most probable word is appended. This
    repeats until the end tag is produced, an index has no word, or
    ``max_length`` words have been generated.

    Args:
        model: object whose ``predict([image, sequence], verbose=0)`` returns next-word scores.
        image: image input passed to the model unchanged.
        tokenizer: object providing ``texts_to_sequences`` and ``word_index``.
        max_length: padded input length and maximum number of decoding steps.

    Returns:
        The generated text, beginning with '<startseq>' and including the end
        tag word when one was produced.
    """
    #add start tag for generation process
    in_text='<startseq>'
    #iterate over the max length of sequence
    for _ in range(max_length):
        #encode the input sequence
        sequence=tokenizer.texts_to_sequences([in_text])[0]
        #pad the sequences
        sequence=pad_sequences([sequence],max_length)
        #predict the next word
        yhat=model.predict([image,sequence],verbose=0)
        #get index with highest probability
        yhat=np.argmax(yhat)
        #convert index to word
        word=idx_to_word(yhat,tokenizer)
        #stop if word not found
        if word is None:
            break
        #append word as input for generating next word
        in_text+=" "+word
        #stop if we reach end tag; a Tokenizer with default filters stores the tag
        #without its angle brackets ('endseq'), so both spellings are accepted
        if word.strip('<>')=='endseq':
            break
    return in_text


In [None]:
from nltk.translate.bleu_score import corpus_bleu
#validation: compare the generated caption of every test image with its reference captions
actual,predicted=list(),list()
for key in tqdm(test):
    #get actual captions
    captions=mapping[key]
    #predict the caption for image; `features` is the id -> feature dictionary
    #(`feature` is only the single array left over from the extraction loop)
    y_pred=predict_caption(model,features[key],tokenizer,max_length)
    #split each caption into words; angle brackets are stripped on both sides so the
    #start/end tags compare equal whether or not the tokenizer kept the brackets
    actual_captions=[[word.strip('<>') for word in caption.split()] for caption in captions]
    y_pred=[word.strip('<>') for word in y_pred.split()]
    #append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)
#calculate BLEU score (one weight per n-gram order, 1-grams to 4-grams)
print("BLEU-1 %f" % corpus_bleu(actual,predicted,weights=(1.0,0,0,0)))
print("BLEU-2 %f" % corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))

In [None]:
#visualize the image
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the reference and predicted captions for one image and display it.

    Args:
        image_name: file name of an image inside the "Images" folder. The part
            before the first '.' is used as the key into ``mapping`` and ``features``.

    Returns:
        None. The captions are printed and the image is drawn with matplotlib.

    Raises:
        KeyError: if the image id is missing from ``mapping`` or ``features``.
    """
    #take the part before the first '.'; the bare split() result is a list, which cannot be a dict key
    image_id=image_name.split('.')[0]
    img_path=os.path.join("Images",image_name)
    image=Image.open(img_path)
    captions=mapping[image_id]
    print("ACTUAL")
    for caption in captions:
        print(caption)
    #predict the caption from the stored features of this image
    #(`features` is the id -> feature dictionary, not the leftover `feature` array)
    y_pred=predict_caption(model,features[image_id],tokenizer,max_length)
    print("PREDICTED")
    print(y_pred)
    plt.imshow(image)

In [None]:
#show the reference captions and the predicted caption for one example image
generate_caption("1001773457_577c3a7d70.jpg")