In [1]:
import pickle
import numpy as np

`Load all the files that we saved earlier for later use`

- caption_dict -> Caption_description.pkl
- train_caption -> train_description.pkl
- word_to_idx -> Word_to_idx.pkl
- idx_to_word -> Idx_to_word.pkl
- emb_mat -> embedding_matrix.npy

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only read files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Caption dictionaries and the two vocabulary lookup tables.
caption_dict = _load_pickle('./Caption_description.pkl')
train_caption = _load_pickle('./train_description.pkl')
word_to_idx = _load_pickle('./Word_to_idx.pkl')
idx_to_word = _load_pickle('./Idx_to_word.pkl')

# Embedding matrix saved as a NumPy array.
with open('./embedding_matrix.npy', 'rb') as handle:
    emb_mat = np.load(handle)

In [3]:
# Inspect the embedding matrix dimensions (rows x embedding size).
emb_mat.shape

(1959, 50)

In [4]:
# Longest training caption, measured in whitespace-separated tokens.
caption_lengths = (
    len(text.split())
    for captions in train_caption.values()
    for text in captions
)
max_length = max(caption_lengths, default=0)

print(max_length)

33


In [5]:
# Number of entries in the word -> index lookup table.
# The Embedding and output layers below are sized with vocab_size + 1;
# presumably index 0 is reserved for padding — TODO confirm.
vocab_size =len(word_to_idx)
print(vocab_size)

1958


In [6]:
# Load the pre-extracted image features for the training set.
# data_generator looks entries up as encoding_train[key] using the keys of train_caption.

with open('./Encoded features/encoded_train_feature.pkl','rb') as f:
    encoding_train = pickle.load(f)

{1: 'child',
 2: 'in',
 3: 'pink',
 4: 'dress',
 5: 'is',
 6: 'climbing',
 7: 'up',
 8: 'set',
 9: 'of',
 10: 'stairs',
 11: 'an',
 12: 'way',
 13: 'girl',
 14: 'going',
 15: 'into',
 16: 'wooden',
 17: 'building',
 18: 'little',
 19: 'the',
 20: 'to',
 21: 'her',
 22: 'black',
 23: 'dog',
 24: 'and',
 25: 'spotted',
 26: 'are',
 27: 'fighting',
 28: 'tri',
 29: 'colored',
 30: 'playing',
 31: 'with',
 32: 'each',
 33: 'other',
 34: 'on',
 35: 'road',
 36: 'white',
 37: 'brown',
 38: 'spots',
 39: 'staring',
 40: 'at',
 41: 'street',
 42: 'two',
 43: 'dogs',
 44: 'different',
 45: 'looking',
 46: 'pavement',
 47: 'moving',
 48: 'toward',
 49: 'covered',
 50: 'paint',
 51: 'sits',
 52: 'front',
 53: 'painted',
 54: 'rainbow',
 55: 'hands',
 56: 'bowl',
 57: 'sitting',
 58: 'large',
 59: 'small',
 60: 'grass',
 61: 'plays',
 62: 'it',
 63: 'there',
 64: 'pigtails',
 65: 'painting',
 66: 'young',
 67: 'outside',
 68: 'man',
 69: 'lays',
 70: 'bench',
 71: 'while',
 72: 'his',
 73: 'by',
 

### Data Generator

- The generator produces training samples in which the model must predict the next word from the preceding sequence of words
    - Example
        - This
        - This is
        - This is a
        - This is a ball
        
From the example above, we can see what the model should predict at each step: the next word of the caption.

`Note:` We do not pass the words themselves; instead, we pass the integer index that represents each word.

In [18]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [7]:
def data_generator(train_caption, encoding_train, word_to_idx, batch_size, max_length, num_classes=None):
    """Endlessly yield training batches for the next-word prediction model.

    Every caption is expanded into one sample per word: the input is the
    prefix of word indices seen so far (post-padded with 0 up to
    `max_length`) and the target is the one-hot encoding of the next word.

    Parameters
    ----------
    train_caption : dict
        Maps an image key to a list of caption strings.
    encoding_train : dict
        Maps the same image keys to their encoded image features.
    word_to_idx : dict
        Maps a word to its integer index; words not present are skipped.
    batch_size : int
        Number of *images* (not samples) whose captions make up one batch.
    max_length : int
        Length every word-index prefix is padded to.
    num_classes : int, optional
        Width of the one-hot target vectors. Defaults to
        ``len(word_to_idx) + 1``, which assumes indices run from 1 to
        ``len(word_to_idx)`` with 0 kept for padding.

    Yields
    ------
    list
        ``[[image_features, padded_prefixes], one_hot_targets]`` with each
        element a NumPy array whose first axis is the sample axis.
    """
    if num_classes is None:
        num_classes = len(word_to_idx) + 1

    x1, x2, y = [], [], []   # image features, padded word prefixes, one-hot targets
    n = 0                    # images accumulated in the current batch

    while True:
        for key, caption_list in train_caption.items():
            n += 1
            encoded_img = encoding_train[key]   # encoded features of this image

            for caption in caption_list:
                seq = [word_to_idx[word] for word in caption.split() if word in word_to_idx]

                for i in range(len(seq)):
                    # Pad after the prefix so every input has the same length.
                    xi = pad_sequences([seq[:i]], maxlen=max_length, value=0, padding='post')[0]
                    # Fixed num_classes keeps every target the same width as the
                    # model's softmax output; without it the width varies per word.
                    yi = to_categorical(seq[i], num_classes=num_classes)

                    x1.append(encoded_img)
                    x2.append(xi)
                    y.append(yi)

            # Emit only once *all* captions of the last image have been added,
            # so an image's samples are never split across two batches.
            if n == batch_size:
                yield [[np.array(x1), np.array(x2)], np.array(y)]
                x1, x2, y = [], [], []   # start the next batch empty
                n = 0

### Model Architecture

In [8]:
from keras.models import Model
from keras.layers import *

`The model has two inputs (image features and a partial caption), so it is built with the Keras functional API rather than as a Sequential model`

In [9]:
# Image branch
img_input = Input(shape=(2048,))     # shape of the extracted image feature vector, taken as input
img_inp1 = Dropout(0.3)(img_input)
img_inp2 = Dense(256,activation='relu')(img_inp1)   # project the 2048-dim input down to 256 dims

# Caption branch
caption_input = Input(shape=(max_length,))          # one padded sequence of max_length word indices
cap_inp1 = Embedding(input_dim=vocab_size+1,output_dim=50,mask_zero=True)(caption_input)
cap_inp2 = Dropout(0.3)(cap_inp1)
cap_inp3 = LSTM(256)(cap_inp2)

In [10]:
# Decoder
# The image branch and the caption branch each end in a 256-unit layer; their
# outputs are merged by element-wise addition (not concatenation).

decoder1 = add([img_inp2,cap_inp3]) # element-wise sum of the image vector and the caption vector
decoder2 = Dense(256,activation='relu')(decoder1)
outputs = Dense(vocab_size+1,activation='softmax')(decoder2)

# Assemble the full two-input model
model = Model([img_input,caption_input],outputs=outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 50)       97950       input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048)         0           input_1[0][0]                    
______________________________________________________________________________________________

In [11]:
# Load the pre-computed embedding matrix into the embedding layer and freeze it.
# The layer is located by type instead of by a hard-coded position in
# model.layers, whose ordering is decided by Keras and can shift if the
# architecture is edited.

embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([emb_mat])   # weights come from the saved embedding matrix
embedding_layer.trainable = False        # keep the embeddings fixed during training

In [12]:
# Categorical cross-entropy pairs with the softmax output and the one-hot next-word targets.
model.compile(loss='categorical_crossentropy',optimizer='adam')