## Data Pre-Processing

### Importing Dependencies

In [1]:
import scipy
%matplotlib inline
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
import matplotlib.patches as mpatches
import sys
import os

# Import Refexp python class
# Please MAKE SURE that ./google_refexp_py_lib is in your
# python library search path
sys.path.append("../Google_Refexp_toolbox-master/google_refexp_py_lib")
from refexp import Refexp

In [2]:
# Specify datasets path.
refexp_filename='../Google_Refexp_toolbox-master/google_refexp_dataset_release/google_refexp_train_201511_coco_aligned.json'
coco_filename='../Google_Refexp_toolbox-master/external/coco/annotations/instances_train2014.json'
imagesDir = '../Google_Refexp_toolbox-master/external/coco/images'
imagesType = 'train2014'

# Create Refexp instance.
refexp = Refexp(refexp_filename, coco_filename)

loading annotations into memory...
Done (t=20.38s)
creating index...
index created!
Dataset loaded.


### Selecting 20000 images from 10 categories

In [3]:
# Get all images that contain all given categories.
catIds = refexp.getCatIds()
imgIds = []
for i in range(0,10):
    imgIds.append(refexp.getImgIds(catIds=catIds[i]))
imgIds = [item for sublist in imgIds for item in sublist]

print "Loading train images..."
# Returns a tuple of image_ids and array of img_ids
train_images = refexp.loadImgs(imgIds[:20000])
test_images = refexp.loadImgs(imgIds[20000:])

print "Train images loaded.."
# print len(train_images), len(test_images)

Loading train images...
Train images loaded..


### Loading annotation id

In [4]:
ids = train_images[1]
train_annotations = []
for id in ids:
    train_annotations.append(refexp.getAnnIds(id))
print "Training Annotations Loaded.."
# print len(train_annotations)

Training Annotations Loaded..


### Loading images

In [5]:
import cv2
# print len(train_images[0])
print "Loading Images"
Images = []
for img in train_images[0]:
    I = cv2.imread(os.path.join(imagesDir, imagesType, img['file_name']))
    I.resize(3,224,224)
    Images.append(I)
Images = np.asarray(Images)
print "Images reshaped and loaded in a numpy array.."

Loading Images
Images reshaped and loaded in a numpy array..


In [6]:
import scipy.io

print "Saving Image array"
matfile = 'Images.mat'
scipy.io.savemat(matfile, mdict={'out': Images}, oned_as='row')
print "Data Saved"

Saving Image array
Data Saved


#### A sample of the data

In [None]:
anns = train_annotations[0]
ann = refexp.loadAnns(anns[0])[0]
print ann

In [None]:
img = train_images[0][0]
I = io.imread(os.path.join(imagesDir, imagesType, img['file_name']))
plt.imshow(I)
ax = plt.axis('off')
ann = refexp.returnAnn(ann)
print "---------------------------------------------------------------------------"
print ann[0]

### Generating list of Captions

In [5]:
text = []
print "Generating list of captions.."
for anns in train_annotations:
    ann = refexp.loadAnns(anns[0])[0]
    text.append(refexp.returnAnn(ann)[0])
final_text = []
print "Adding START and END Tokens.."
for sentence in text:
    final_text.append("START " + sentence + " END")
# print len(final_text)

Generating list of captions..
Adding START and END Tokens..


In [9]:
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence

print "Preprocessing begin"

words = [txt.split() for txt in final_text]
unique = []
for word in words:
    unique.extend(word)
unique = list(set(unique))
print len(unique)
print "Section One ended..."

word_index = {}
index_word = {}
for i,word in enumerate(unique):
    word_index[word] = i
    index_word[i] = word
print "Section two ended..."

partial_captions = []
for text in final_text:
    one = [word_index[txt] for txt in text.split()]
    partial_captions.append(one)
partial_captions = sequence.pad_sequences(partial_captions, maxlen=40,padding='post')
print "Section three ended..."


next_words = np.zeros((16208,10000))
for i,text in enumerate(final_text):
    text = text.split()
    x = [word_index[txt] for txt in text]
    x = np.asarray(x)
    next_words[i,x] = 1

print "Data preprocessing done"

Preprocessing begin
8258
Section One ended...
Section two ended...
Section three ended...
Data preprocessing done


In [10]:
print partial_captions.shape
print next_words.shape

(16208, 40)
(16208, 10000)


In [12]:
import scipy.io
print "Saving partial_Captions and next word array"
matfile = 'partial_cap.mat'
matfile2 = 'next_words.mat'
scipy.io.savemat(matfile, mdict={'out': partial_captions}, oned_as='row')
scipy.io.savemat(matfile2, mdict={'out': next_words}, oned_as='row')
print "Data Saved"

Saving partial_Captions and next word array
Data Saved


### Define the Model

In [7]:
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, RepeatVector
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.layers import Merge
from keras.optimizers import SGD
from keras.layers.core import Activation
import numpy as np
import h5py

Using Theano backend.
Using gpu device 0: GeForce GTX 760 (CNMeM is disabled, cuDNN 5105)


In [8]:
print "Preparing VGG-16 Model.."

model = Sequential()
model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
model.add(Convolution2D(64, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(64, 3, 3, activation='relu'))
model.add(MaxPooling2D((2,2), strides=(2,2)))

model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(128, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(128, 3, 3, activation='relu'))
model.add(MaxPooling2D((2,2), strides=(2,2)))

model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(256, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(256, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(256, 3, 3, activation='relu'))
model.add(MaxPooling2D((2,2), strides=(2,2)))

model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(512, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(512, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(512, 3, 3, activation='relu'))
model.add(MaxPooling2D((2,2), strides=(2,2)))

model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(512, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(512, 3, 3, activation='relu'))
model.add(ZeroPadding2D((1,1)))
model.add(Convolution2D(512, 3, 3, activation='relu'))
model.add(MaxPooling2D((2,2), strides=(2,2)))

model.add(Flatten())
model.add(Dense(4096, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4096, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1000, activation="softmax"))

model.load_weights('vgg16_weights.h5')
for layer in model.layers:
    layer.trainable=False

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy')

print "Model prepared"

Preparing VGG-16 Model..
Model prepared


In [11]:
print "Forming Image Features.."
img_features = model.predict(Images[:10])
print "Image Features formed.."

Forming Image Features..


MemoryError: Error allocating 411041792 bytes of device memory (out of memory).
Apply node that caused the error: GpuAllocEmpty(Shape_i{0}.0, Shape_i{0}.0, Elemwise{Composite{((i0 - (((i1 - i2) * i3) + i2)) + i2)}}[(0, 1)].0, Elemwise{Composite{((i0 - (((i1 - i2) * i3) + i2)) + i2)}}[(0, 1)].0)
Toposort index: 131
Inputs types: [TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar)]
Inputs shapes: [(), (), (), ()]
Inputs strides: [(), (), (), ()]
Inputs values: [array(32), array(64), array(224), array(224)]
Outputs clients: [[GpuDnnConv{algo='small', inplace=True}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConvDesc{border_mode='valid', subsample=(1, 1), conv_mode='conv', precision='float32'}.0, Constant{1.0}, Constant{0.0})]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [None]:
vocab_size = 10000
max_caption_len = 40

print "Preparing Language Model."
language_model = Sequential()
language_model.add(Embedding(vocab_size,128, input_length=max_caption_len))
language_model.add(LSTM(output_dim=128, return_sequences=True))
language_model.add(TimeDistributed(Dense(128)))
language_model.output
print "Language Model set."

In [None]:
# model.add(Flatten())
# model.add(Dense(4096, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(4096, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(1000, activation="softmax"))
print "Repeating the Image Features.."
model.add(RepeatVector(max_caption_len))

In [None]:
print "Merging Language and Image Model"
full_model = Sequential()
full_model.add(Merge([model, language_model], mode='concat', concat_axis=-1))
full_model.add(LSTM(512, return_sequences=False))
full_model.add(Dense(30158))
full_model.add(Activation('softmax'))
print "Models merged..."

In [None]:
full_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
# full_model.summary()
full_model.fit([Images[:5000], partial_captions[:5000]],next_words[:5000],batch_size=16,nb_epoch=10)

## 5 D Vector

In [None]:
from __future__ import print_function

with h5py.File('data.h5', 'w') as hf:
    hf.create_dataset('dataset_1', data=Images)
    hf.create_dataset('dataset_2', data=partial_captions)

In [None]:
print model.summary()