In [3]:
import pickle
import numpy as np

`Read all the files that we saved earlier for later use`

- caption_dict -> Caption_description.pkl
- train_caption -> train_description.pkl
- word_to_idx -> Word_to_idx.pkl
- idx_to_word -> Idx_to_word.pkl
- emb_mat -> embedding_matrix.npy

In [4]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Caption dictionaries and the word <-> index lookup tables
caption_dict = _load_pickle('./Caption_description.pkl')
train_caption = _load_pickle('./train_description.pkl')
word_to_idx = _load_pickle('./Word_to_idx.pkl')
idx_to_word = _load_pickle('./Idx_to_word.pkl')

# Embedding matrix stored in NumPy's .npy format
with open('./embedding_matrix.npy','rb') as emb_file:
    emb_mat = np.load(emb_file)

In [3]:
emb_mat.shape   # (number of rows, embedding dimension) of the loaded embedding matrix

(2968, 50)

In [4]:
# Longest training caption, measured in whitespace-separated tokens
# (0 when there are no captions at all)
max_length = max(
    (len(sentence.split())
     for sentences in train_caption.values()
     for sentence in sentences),
    default=0,
)

print(max_length)

33


In [5]:
# Number of words that have an index. Other cells use vocab_size+1 as the
# number of classes / embedding rows; presumably index 0 is kept free for padding.
vocab_size =len(word_to_idx)
print(vocab_size)

2967


In [6]:
# Load the feature vectors that were extracted earlier for the training images

with open('./Encoded features/encoded_train_feature.pkl','rb') as features_file:
    encoding_train = pickle.load(features_file)

### Data Generator

- This generator produces training samples in which the model has to predict the next word from the preceding sequence of words
    - Example
        - This
        - This is
        - This is a
        - This is a ball
        
From the above example we can see what our model should predict at each step

`Note:` We do not pass the words themselves; instead we pass the integer index that represents each word

In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [8]:
def data_generator(train_caption,encoding_train,word_to_idx,batch_size,max_length):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into one sample per word: the words before
    position ``i`` (as indices, post-padded with 0 to ``max_length``) are the
    input and the word at position ``i`` (one-hot encoded) is the target.

    Parameters
    ----------
    train_caption : dict
        Maps an image key to a list of caption strings.
    encoding_train : dict
        Maps the same image keys to the encoded image feature vector.
    word_to_idx : dict
        Maps a word to its integer index. Words missing from this mapping
        are dropped from the caption.
    batch_size : int
        Number of *images* (not samples) whose captions make up one batch.
    max_length : int
        Length every partial caption is padded to.

    Yields
    ------
    tuple
        ``([x1, x2], y)`` where ``x1`` holds the image features, ``x2`` the
        padded partial captions and ``y`` the one-hot next-word targets, each
        as a NumPy array with one row per sample.

    Notes
    -----
    The generator never terminates. Images left over at the end of a pass
    over ``train_caption`` are carried into the first batch of the next pass.
    """
    # One class per indexed word plus one extra slot for index 0, which is the
    # padding value below (assumes real word indices start at 1 -- TODO confirm).
    num_classes = len(word_to_idx) + 1

    x1,x2,y = [],[],[]        # image features, padded word sequences, one-hot targets
    n = 0                     # images accumulated in the current batch

    while True:
        for key,caption_list in train_caption.items():
            n += 1
            encoded_img = encoding_train[key]       # encoded features of this image

            for caption in caption_list:
                seq = [word_to_idx[word] for word in caption.split() if word in word_to_idx]

                # i = 0 gives an empty (all-padding) input whose target is the
                # first word of the caption.
                for i in range(0,len(seq)):
                    # pad after the partial caption so every input has the same length
                    xi = pad_sequences([seq[0:i]],maxlen=max_length,value=0,padding='post')[0]
                    # the softmax output layer is trained against a one-hot vector
                    yi = to_categorical([seq[i]],num_classes=num_classes)[0]

                    x1.append(encoded_img)
                    x2.append(xi)
                    y.append(yi)

            # Emit the batch only after ALL captions of the current image have
            # been added, so a batch always contains whole images.
            if n==batch_size:
                yield ([np.asarray(x1),np.asarray(x2)],np.asarray(y))
                # start the next batch empty so samples are not repeated
                x1,x2,y = [],[],[]
                n = 0

### Model Architecture

In [9]:
from keras.models import Model
from keras.layers import *
import os
import glob

`The model is not a Sequential model: it has two inputs (image features and caption), so it is built with the functional API`

In [10]:
# Image branch: takes the pre-extracted image feature vector
img_input = Input(shape=(2048,))     #shape of the extracted image features, taken as input
img_inp1 = Dropout(0.3)(img_input)
img_inp2 = Dense(256,activation='relu')(img_inp1)   #project the 2048-dim input down to 256 dims

# Caption branch: takes a padded sequence of word indices
caption_input = Input(shape=(max_length,))          #input length is max_length (the longest caption), not the vocab size
# vocab_size+1 rows: one per indexed word plus index 0, which mask_zero=True treats as padding to be masked
cap_inp1 = Embedding(input_dim=vocab_size+1,output_dim=50,mask_zero=True)(caption_input)
cap_inp2 = Dropout(0.3)(cap_inp1)
cap_inp3 = LSTM(256)(cap_inp2)

In [11]:
# main Model
# Decoder: merges the image vector and the caption vector by element-wise addition (not concatenation)

decoder1 = add([img_inp2,cap_inp3]) #sums the 256-dim output of the image branch with the 256-dim output of the caption branch
decoder2 = Dense(256,activation='relu')(decoder1)
outputs = Dense(vocab_size+1,activation='softmax')(decoder2)   #one probability per class (vocab_size+1 classes)

#  Combine Model: two inputs (image features, partial caption) -> next-word distribution
model = Model([img_input,caption_input],outputs=outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 50)       97950       input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048)         0           input_1[0][0]                    
______________________________________________________________________________________________

In [12]:
# Set embedding layer weights
# Load the embedding matrix into the Embedding layer and freeze it so the
# vectors are not updated during training. The layer is looked up by type
# rather than by position, so this cell keeps working if the order of
# `model.layers` changes.

embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([emb_mat])
embedding_layer.trainable = False

In [13]:
# Adam optimiser with categorical cross-entropy, matching the one-hot targets
model.compile(optimizer='adam',loss='categorical_crossentropy')

In [14]:
# Train model

batch_size = 3                      # number of images whose captions form one generator batch
epochs = 10
images_per_batch = batch_size       # kept as a separate name; must always equal batch_size
steps = len(train_caption)//images_per_batch   # generator batches per pass over the training images

# Prepare an empty output directory for the saved models.
# Only "already exists" is handled here; any other OS error (permissions,
# missing parent, ...) is allowed to surface instead of being mistaken for
# an existing directory.
try:
    os.mkdir('./model_weights')
    print('Directory Created')
except FileExistsError:
    print('Directory exists')
    # remove models left over from a previous run
    for stale_file in glob.glob('./model_weights/*'):
        os.remove(stale_file)
    
    
def train():
    """Train `model` for `epochs` epochs and save it after every epoch.

    Reads the notebook-level names `epochs`, `steps`, `batch_size`,
    `max_length`, `train_caption`, `encoding_train`, `word_to_idx` and `model`.
    After epoch k the model is written to ``./model_weights/model_k.h5``.
    """
    for epoch in range(epochs):
        # a fresh generator per epoch restarts from the first training image
        generator = data_generator(train_caption,encoding_train,word_to_idx,batch_size,max_length)
        model.fit(generator,epochs=1,steps_per_epoch=steps,verbose=1)
        model.save(f'./model_weights/model_{epoch+1}.h5')

Directory exists


In [15]:
train()

