In [1]:
import os
import pickle
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add
from tqdm.notebook import tqdm





In [55]:
model=VGG16()
model=Model(inputs=model.inputs,outputs=model.layers[-2].output)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

EXTRACT FEATURES FROM IMAGES 

In [56]:
features={}
dir=os.path.join('D:/aiml/Images')
for img in tqdm(os.listdir(dir)):
    img_path=dir+'/'+img
    image=load_img(img_path,target_size=(224,224))
    image=img_to_array(image)
    image=image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
    #preprocess od image
    image=preprocess_input(image)
    feature=model.predict(image,verbose=0)
    img_id=img.split('.')[0]
    features[img_id]=feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [57]:
pickle.dump(feature,open(os.path.join('feature.pkl'),'wb'))

In [None]:
with open(os.path.join(working_dir,'feature.pkl'),'rb') as f:
    features pickle.load(f)
    

In [85]:
file_path_txt = "D:/aiml/captions.txt"
with open(os.path.join('D:/aiml/captions.txt'),'r') as f:
    next(f)
    captions_doc=f.read()

In [86]:
mapping={}
for line in captions_doc.split('\n'):
    tokens=line.split('.')
    img_id ,caption=tokens[0],tokens[1:]
    img_id=img_id.split('.')[0]
    caption=" ".join(caption)
    if img_id not in mapping:
        mapping[img_id]=[]
    mapping[img_id].append(caption) 
    

In [87]:
print(len(mapping))

8092


In [90]:
def cleaning(mapping):
    for key,captions in mapping.items():
        for i in range(0,len(captions)):
            caption=captions[i]
            caption=caption.lower()
            caption=caption.replace('jpg,','')
            caption=caption.replace('start,','')
            caption=caption.replace('[^A-Za-z]','')
            caption=caption.replace('\s',' ')
            caption="startsen "+" ".join([word for word in caption.split() if len(word)>1])+" endsen"
            captions[i]=caption

In [92]:
mapping['1000268201_693b08cb0e']

['startsen child in pink dress is climbing up set of stairs in an entry way endsen',
 'startsen girl going into wooden building endsen',
 'startsen little girl climbing into wooden playhouse endsen',
 'startsen little girl climbing the stairs to her playhouse endsen',
 'startsen little girl in pink dress going into wooden cabin endsen']

In [91]:
cleaning(mapping)

In [93]:
all_captions=[]
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

In [94]:
len(all_captions)

40456

In [95]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size=len(tokenizer.word_index)+1

In [96]:
vocab_size

8477

In [97]:
mx_len=max(len(caption.split()) for caption in all_captions)
mx_len

34

In [98]:
img_id=list(mapping.keys())
split=int(len(img_id)*0.90)
train = img_id [:split]
test = img_id [split:]

In [99]:
# create data generator to get data in batch (avoids session crash)
def data_generator (data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
#loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions=mapping [key]
            # process each caption
            for caption in captions:
            #encode the sequence
                seq = tokenizer.texts_to_sequences ( [caption]) [0]
                #split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    #split into input and output pairs
                    in_seq, out_seq= seq[:i], seq[i]
                    #pad input sequence
                    in_seq = pad_sequences ([in_seq], maxlen=max_length) [0]
                    # encode output sequence
                    out_seq=to_categorical([out_seq], num_classes=vocab_size) [0]
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n==batch_size:
                X1,X2,y=np.array(X1),np.array(X2),np.array(y)
                yield[X1,X2],y
                X1,X2,y=list(),list(),list()
                n=0

In [65]:
inputs1=Input(shape=(4096,))
fe1=Dropout(0.4)(inputs1)
fe2=Dense(256,activation='relu')(fe1)
inputs2=Input(shape=(mx_len,))
se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
se2=Dropout(0.4)(se1)
se3=LSTM(256)(se2)

decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
output=Dense(vocab_size,activation='softmax')(decoder2)

model=Model(inputs=[inputs1,inputs2],outputs=output)
model.compile(loss='categorical_crossentropy',optimizer='adam')

plot_model(model,show_shapes=True)



You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [None]:
epochs=15
batch_size=64
steps=len(train)//batch_size
for i in range (epochs):
#create data generator
    generator = data_generator(train, mapping, features, tokenizer, mx_len, vocab_size, batch_size)
#fit for one epoch
    model.fit (generator, epochs=1, steps_per_epoch=steps, verbose=1)


In [None]:
def idx_to_word(integer,tokenizer):
    for word,index in tokenizer,word_index,items():
        if index==integer:
            return word
    return None

In [70]:
def predict_caption(model, image, tokenizer, max_length) :
# add start tag for generation process
    in_text = '<start>'
    # iterate over the max length of sequence
    for i in range (max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        #pad the sequence
        sequence= pad_sequences([sequence], max_length) 
        # predict next word
        yhat= model.predict([image, sequence], verbose=0)
        # get index with high probability
        yhat= np.argmax (yhat)
        #convert index to word
        word = idx_to_word(yhat, tokenizer)
    #stop if word not found
        if word is None:
            break
    # append word as input for generating next word
        in_text += " " + word
    # stop if we reach end tag
        if word =='<end>' :
            break
    return in_text

In [None]:
actual,predicted=list(),list()
for key in test:
    captions=mapping[key]
    y_pred=predict_caption(model,features[key],tokenizer,mx_len)
    actual_caption=[caption.split() for caption in captions]
    y_pred=y_pred.split()
    actual.append(actual_caption)
    predicted.append(y_pred)

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(img_name):
    image_name=""
    img_path=os.path.join("D:\aiml\Images")
    image=Image.open(img_path)
    captions=mapping[img_id]
    for caption in captions:
        print(caption)
    Y_pred=predict_caption(model,features[img_id],tokenizer,mx_len)
    print(Y_pred)
    plt.imshow(image)