In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import pandas as pd

In [37]:
from keras.preprocessing import image, sequence
from keras.applications import VGG16
from keras.layers import Dense, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector, Merge
from keras.models import Sequential, Model
from keras.optimizers import Adam

In [3]:
# Load the tab-separated Flickr8k training captions file and keep its raw
# values as a NumPy array for positional indexing.
pd_dataset = pd.read_csv("./Flickr8k_text/flickr_8k_train_dataset.txt", delimiter='\t')
ds = pd_dataset.values
# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(ds.shape)

(30000, 2)


In [4]:
# Collect every entry of column 1 of the dataset array into a plain list
# (a column slice replaces the manual row-by-row append loop).
sentences = list(ds[:, 1])

# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(len(sentences))

30000


In [5]:
# Whitespace-tokenise each sentence; `words` is a list of token lists.
words = [sentence.split() for sentence in sentences]

In [6]:
# Flatten the per-sentence token lists into one long list (duplicates kept,
# original order preserved).
unique = [token for sentence_tokens in words for token in sentence_tokens]

In [7]:
# De-duplicate the tokens. The position of each word in this list is what
# later becomes its integer index.
# NOTE(review): the order comes from set iteration, which is not guaranteed to
# be stable across interpreters/runs -- confirm it matches whatever ordering
# was used to produce the saved .npy arrays loaded further down.
unique = list(set(unique))
vocab_size = len(unique)

# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(vocab_size)

8253


In [8]:
#Vectorization
# Build the two lookup tables from the position of each word in `unique`:
# index -> word first, then its inverse word -> index.
indices_2_word = dict(enumerate(unique))
word_2_indices = dict(zip(unique, range(len(unique))))

In [9]:
# Reserve index 0 for the unknown-word token 'UNK'.
# The word that currently occupies index 0 is moved to a fresh index at the end
# of the vocabulary. Looking it up (instead of hard-coding 'raining' / 8253)
# keeps both maps consistent for any ordering of the vocabulary list.
displaced_word = indices_2_word.get(0)
if displaced_word != 'UNK':  # guard: re-running the cell must not relocate 'UNK' itself
    if displaced_word is not None:  # nothing to relocate when the vocabulary is empty
        relocated_index = len(indices_2_word)
        word_2_indices[displaced_word] = relocated_index
        indices_2_word[relocated_index] = displaced_word

    word_2_indices['UNK'] = 0
    indices_2_word[0] = 'UNK'

In [10]:
# Sanity-check the word -> index -> word round trip for the sentence markers.
# The index is looked up rather than hard-coded so the check stays valid if the
# vocabulary ordering changes; output order is unchanged (index, then word).
for marker in ('<start>', '<end>'):
    marker_index = word_2_indices[marker]
    print(marker_index)
    print(indices_2_word[marker_index])

4011
<start>
8051
<end>


In [11]:
# Recompute the vocabulary size from the word -> index map, which now also
# contains the 'UNK' entry.
vocab_size = len(word_2_indices)
# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(vocab_size)

8254


### Model

In [12]:
# Load the precomputed training arrays from disk.
# NOTE(review): presumably `captions` holds padded partial-caption index
# sequences and `next_words` the one-hot target word for each -- confirm
# against the notebook/script that wrote these files.
captions = np.load("./captions.npy")
next_words = np.load("./next_words.npy")

# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(captions.shape)
print(next_words.shape)

(25493, 40)
(25493, 8254)


In [25]:
# Load the precomputed per-sample image feature array from disk.
images = np.load("./images.npy")

# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(images.shape)

(25493, 2048)


In [24]:
# Load the array stored in image_names.npy.
# NOTE(review): this rebinds `image`, the same name the Keras import cell binds
# to `keras.preprocessing.image`; whichever cell runs last wins. The name is
# kept here in case later cells rely on it -- consider renaming both uses.
image = np.load("./image_names.npy")

# Parenthesised print gives the same output on Python 2 and is valid on Python 3.
print(image.shape)

(25493,)


In [18]:
# Hyper-parameters shared by the image and language branches below.
max_len = 40          # number of time steps (RepeatVector count / Embedding input_length)
embedding_size = 128  # width of the image projection and of the word embedding

In [27]:
# Image branch: project the 2048-wide input vector down to `embedding_size`
# with a ReLU dense layer, then repeat it `max_len` times so it lines up with
# the caption sequence.
image_model = Sequential([
    Dense(embedding_size, input_shape=(2048,), activation='relu'),
    RepeatVector(max_len),
])

image_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 128)               262272    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 40, 128)           0         
Total params: 262,272
Trainable params: 262,272
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Language branch: embed the word indices, run them through an LSTM that
# returns the full sequence, and project every time step to `embedding_size`.
language_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(embedding_size)),
])

language_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 128)           1056512   
_________________________________________________________________
lstm_4 (LSTM)                (None, 40, 256)           394240    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 40, 128)           32896     
Total params: 1,483,648
Trainable params: 1,483,648
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Optimizer used when compiling the combined model.
# NOTE(review): 1e-2 is a fairly aggressive Adam step size; the recorded
# training log shows slow progress -- consider trying a smaller value.
adam = Adam(lr=1e-2)

In [44]:
# Combined model: concatenate the image and language branches along the last
# axis, run the result through an LSTM and a bidirectional LSTM, and finish
# with a softmax over the vocabulary.
# NOTE(review): the stray warning line in the recorded output is presumably
# Keras' deprecation warning for `Merge` -- confirm before upgrading Keras.
# NOTE(review): the recorded summary shows the Bidirectional layer emitting
# 1024 features, whereas LSTM(256) would give 512 with the default concat
# merge -- the saved output looks stale; confirm the intended width.
model = Sequential([
    Merge([image_model, language_model], mode='concat', concat_axis=-1),
    LSTM(128, return_sequences=True),
    Bidirectional(LSTM(256, return_sequences=False)),
    Dense(vocab_size),
    Activation('softmax'),
])

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  This is separate from the ipykernel package so we can avoid doing imports until


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_9 (Merge)              (None, 40, 256)           0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 40, 128)           197120    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1024)              2625536   
_________________________________________________________________
dense_14 (Dense)             (None, 8254)              8460350   
_________________________________________________________________
activation_9 (Activation)    (None, 8254)              0         
Total params: 13,028,926
Trainable params: 13,028,926
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the combined model: inputs are the image features and caption
# sequences, targets are the `next_words` array. The returned history object
# is kept in `hist`.
hist = model.fit(x=[images, captions], y=next_words, epochs=20, batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
 3584/25493 [===>..........................] - ETA: 3:06 - loss: 7.1117 - acc: 0.0784

In [None]:
# Persist the trained weights (weights only, not the architecture) to disk.
model.save_weights(filepath="./model_weights.h5")