In [2]:
import pandas as pd 
import numpy as np 
import os
import pickle

In [4]:
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add

In [6]:
model = VGG16()
model = Model(inputs = model.inputs,outputs = model.layers[-2].output)
print(model.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [7]:
BASE_DIR = '/Users/91887/imageClassification/archive (1)'
WORKING_DIR = '/Users/91887/imageClassification'

Extact Image Features

In [9]:
from tqdm.notebook import tqdm

In [11]:
import tensorflow as tf

In [12]:
config = tf.compat.v1.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.95
config.gpu_options.visible_device_list = "0"
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))




In [18]:
features = {}
directory = os.path.join(BASE_DIR,'Images')

for img_name in tqdm(os.listdir(directory)):
    #load the image from file 
    img_path = directory+'/'+img_name
    image = load_img(img_path,target_size = (224,224))
    #image pixel to numpy array 
    image = img_to_array(image)
    #reshapre data for model
    image =  image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
    #preprocess image for vgg
    image = preprocess_input(image)
    #Extract the Features
    feature = model.predict(image,verbose=0)
    #get Image ID
    image_id = img_name.split('.')[0]
    #store the Features
    features[image_id] = feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [19]:
# Store Features in pickle 

pickle.dump(features,open(os.path.join(WORKING_DIR,'features.pkl'),'wb'))

In [20]:
# load Features from pickle 

with open(os.path.join(WORKING_DIR,'features.pkl'),'rb') as f:
    features = pickle.load(f)

In [21]:
with open(os.path.join(BASE_DIR,'captions.txt'),'r') as f:
    next(f)
    captions_doc = f.read()

In [50]:
# Create mapping of images to captions

# Creating Dict name mapping 

mapping = {}

#process lines 

for line in tqdm(captions_doc.split('\n')):
    #Split the line by comma
    tokens = line.split(',')
    if len(line)<2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    #remove exe from image_id
    image_id = image_id.split('.')[0]
    #convert caption list into the string 
    caption =" ".join(caption)
    #Create a list 
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [51]:
len(mapping)

8091

In [52]:
import re

In [53]:
def clean(mapping):
    for key,captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]
            #Preprocessing steps
            caption = caption.lower()
            caption = caption.replace('[^A-Za-z]', '')  #Delete special character and digit
            caption = caption.replace('\s+',' ')
            # add Start and End tags to the caption
            caption = 'start ' + " ".join([word for word in caption.split() if len(word)>1]) + ' end'
            captions[i] = caption

In [54]:
value = mapping["1000268201_693b08cb0e"]
print(value)

['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']


In [55]:
clean(mapping)

In [56]:
mapping['1000268201_693b08cb0e']

['start child in pink dress is climbing up set of stairs in an entry way end',
 'start girl going into wooden building end',
 'start little girl climbing into wooden playhouse end',
 'start little girl climbing the stairs to her playhouse end',
 'start little girl in pink dress going into wooden cabin end']

In [57]:
# Create a Tokenizer for getting the index of the corressponding word 
# Create vocabulary size

In [58]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

In [59]:
len(all_captions)

40455

In [60]:
all_captions[:10]

['start child in pink dress is climbing up set of stairs in an entry way end',
 'start girl going into wooden building end',
 'start little girl climbing into wooden playhouse end',
 'start little girl climbing the stairs to her playhouse end',
 'start little girl in pink dress going into wooden cabin end',
 'start black dog and spotted dog are fighting end',
 'start black dog and tri-colored dog playing with each other on the road end',
 'start black dog and white dog with brown spots are staring at each other in the street end',
 'start two dogs of different breeds looking at each other on the road end',
 'start two dogs on pavement moving toward each other end']

In [61]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

In [62]:
voca_size = len(tokenizer.word_index)+1

In [63]:
voca_size

8483

In [64]:
# Get max length of the caption available
max_len = max(len(caption.split())for caption in all_captions)
max_len

35

Train Test Split

In [65]:
image_ids = list(mapping.keys())
split = int(len(image_ids)*0.80)
split

6472

In [66]:
train = image_ids[:split]
test = image_ids[split:]

In [67]:
def data_generator (data_keys,mapping,features,tokenizer,max_length,voca_size,batch_size):
    #looping over images 
    X1,X2,y = list(),list(),list()
    n =0
    while 1:
        for key in data_keys:
            n+=1;
            captions = mapping[key]
            for caption in captions:
                #Encode the Sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                #Split the sequence into X y pairs 
                for i in range(1,len(seq)):
                    in_seq,out_seq = seq[:i],seq[i]
                    #pad input sequence
                    in_seq = pad_sequences([in_seq],maxlen=max_len)[0]
                    #encode output seq 
                    out_seq = to_categorical([out_seq],num_classes = voca_size)[0]
                    
                    #Store the seq
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
                    
            if n == batch_size:
                X1,X2,y = np.array(X1),np.array(X2),np.array(y)
                yield [X1,X2],y
                X1, X2, y = list(), list(), list()
                n =0

Model Creation

In [68]:
import pydot 

In [69]:
#Encoder Model
#Image Feature layers
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256,activation='relu')(fe1)
#Sequence feature layer
inputs2 = Input(shape =(max_len,))
se1 = Embedding(voca_size,256,mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

#Decoder Model
decoder1 = add([fe2,se3])
decoder2 = Dense(256,activation = 'relu')(decoder1)
outputs = Dense(voca_size,activation='softmax')(decoder2)

model = Model(inputs=(inputs1,inputs2),outputs = outputs)
model.compile(loss = 'categorical_crossentropy',optimizer='adam')

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(None, 35)]         0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, 4096)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 35, 256)      2171648     ['input_12[0][0]']               
                                                                                                  
 dropout_8 (Dropout)            (None, 4096)         0           ['input_11[0][0]']               
                                                                                            

In [None]:
#Train the Model
epochs = 15
batch_size = 64
steps = len(train)//batch_size

for i in range(epochs):
    generator = data_generator(train,mapping,features,tokenizer,max_len,voca_size,batch_size)
    model.fit(generator,epochs=1,steps_per_epoch = steps,verbose=1)



In [None]:
#Save the Model 
model.save(WORKING_DIR+'/best_model.h5')

In [None]:
#Generate Captions for the Image 
#Index to Word

def idx_to_word(integer,tokenizer):
    for word,index in tokenizer.word_index.items():
        if index == integer:
            return word 
         return None   

In [None]:
#Generate Caption for the Image 
def predict_caption(model,image,tokenizer,max_length):
    #Add start tag for generation process
    in_text = 'start'
    #iterate over max-length of sequence
    for i in range(max_length):
        #Encode input sequences
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        #Pad the Seequence
        sequence = pad_sequences([sequence], max_length)
        #predict next word 
        yhat = model.predict([image,sequence],verbose = 0)
        #Get index with High Probability
        yhat = np.argmax(yhat)
        #Convert index to word
        word = idx_to_word(yhat,tokenizer)
        if word is None:
            break
        #Append word as input for generating next word
        in_text += " "+word
        #Stop if we reach end Tag
        if word == 'end':
            break
        return in_text

Find BLEU Score 

In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
#Validate with Test Data
actual,predicted = list(),list()

for key in tqdm(test):
    #get Actual Caption
    captions = mapping[key]
    #predict the caption of the Image 
    y_pred = predict_caption(model,features[key],tokenizer,max_length)
    #Split into words 
    y_pred = y_pred.split()
    actual_captions = [caption.split() for caption in captions]
    actual.append(actual_captions)
    predicted.append(y_pred)

#Calculate BLEU score
print("BLEU-1 : %f"%corpus_bleu(actual,predicted,weights=(1.0,0,0,0)))
print("BLEU-2 : %f"%corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))

Visualize the Results 

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    image_name = "1022454332_6af2c1449a.jpg"
    image_id = image_name.split('.')[0]
    image_path = os.path.join(BASE_DIR,'Images',image_name)
    image = Image.open(img_path)
    captions = mapping[image_id]
    print('------------------Actual------------------')
    for caption in captions:
        print(caption)
    y_pred = predict_caption(model,features[image_id],tokenizer,max_length)
    print('------------------Predicted------------------')
    print(y_pred)
    plt.imshow(image)