### Pre-Processsing data
#### Preprocessing Data includes the scaling of images and bounding box to (3,224,224). Followed by extracting VGG features for all of them and then passing the same into the baseline model.

In [3]:
import scipy
%matplotlib inline
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
import matplotlib.patches as mpatches
import sys
import os

# Import Refexp python class
# Please MAKE SURE that ./google_refexp_py_lib is in your
# python library search path
sys.path.append("../data/Google_Refexp_toolbox/google_refexp_py_lib/")
from refexp import Refexp

In [2]:
refexp_filename= '../data/Google_Refexp_toolbox/google_refexp_dataset_release/google_refexp_train_201511_coco_aligned.json'
coco_filename= '../data/Google_Refexp_toolbox/external/coco/annotations/instances_train2014.json'
imagesDir = '../data/Google_Refexp_toolbox/external/coco/images'
imagesType = 'train2014'

# Create Refexp instance.
refexp = Refexp(refexp_filename, coco_filename)

loading annotations into memory...
Done (t=24.45s)
creating index...
index created!
Dataset loaded.


In [4]:
catIds = refexp.getCatIds()
imgIds = []
for i in range(0,10):
    imgIds.append(refexp.getImgIds(catIds=catIds[i]))
imgIds = [item for sublist in imgIds for item in sublist]

print "Loading train images..."
# Returns a tuple of image_ids and array of img_ids
train_images = refexp.loadImgs(imgIds[:20000])
test_images = refexp.loadImgs(imgIds[20000:])

print "Train images loaded.."

Loading train images...
Train images loaded..


In [6]:
ids = train_images[1]
train_annotations = []
for id in ids:
    train_annotations.append(refexp.getAnnIds(id))
print "Training Annotations Loaded.."

Training Annotations Loaded..


In [7]:
annotations = []
print "Generating annotation.."
for anns in train_annotations:
    annotations.append(refexp.loadAnns(anns[0])[0])

Generating annotation..


In [8]:
region_candidates = []
for img in train_images[0]:
    candidates = refexp.getRegionCandidates(img)
    fiveDVec = np.zeros(shape=(len(candidates),5))
    i = 0
    for candidate in candidates:
        if candidate['bounding_box'][0] < 0 or candidate['bounding_box'][1] < 0 or candidate['bounding_box'][2] < 0 or candidate['bounding_box'][3] < 0:
            i = i+1
            continue
        x_tl = [candidate['bounding_box'][0]]
        y_tl = [candidate['bounding_box'][1]+candidate['bounding_box'][3]]
        x_br = [candidate['bounding_box'][0]+candidate['bounding_box'][2]]
        y_br = [candidate['bounding_box'][1]]
        area = [(candidate['bounding_box'][2]*candidate['bounding_box'][3]*1.0)/(224*224)]
        if area == 0:
            i = i+1
            continue
        fiveDVec[i] = np.concatenate([x_tl,y_tl,x_br,y_br,area])
        i = i+1
    region_candidates.append(fiveDVec)

In [9]:
region_candidates = region_candidates[:16200]
region_candidates = np.asarray(region_candidates)
print region_candidates.shape

(16200,)


#### Rescaling the bounding boxes to the input shape of the model

In [10]:
import cv2
i=0
PROPOSALS = []
for candidates in region_candidates:
    region_proposals = []
    j = 0
    count = 0
    for candidate in candidates:
        try:
            if candidate[0]== 0 and candidate[1]== 0 and candidate[2]== 0 and candidate[3]== 0:
                j = j+1
                continue
            I = io.imread(os.path.join(imagesDir, imagesType, img['file_name']))
            sub_img = I[candidate[3]*1.0:candidate[1]*1.0, candidate[0]*1.0:candidate[2]*1.0]
#             print sub_img.shape[0], sub_img.shape[1]
            if sub_img.shape[0] > sub_img.shape[1]:
                fac = 224.0/sub_img.shape[0]
                res = cv2.resize(sub_img, None, fx=fac, fy=fac, interpolation = cv2.INTER_CUBIC)
                res = cv2.copyMakeBorder(res,0,0,0,(224-res.shape[1]),cv2.BORDER_CONSTANT,value=0)
            else:
                fac = 224.0/sub_img.shape[1]
                res = cv2.resize(sub_img, None, fx=fac, fy=fac, interpolation = cv2.INTER_CUBIC)
                res = cv2.copyMakeBorder(res,0,(224-res.shape[0]),0,0,cv2.BORDER_CONSTANT,value=0)
            res.resize(3,224,224)
            res = np.asarray(res)
            region_proposals.append(res)
            j = j+1
            count = count +1
            if count > 5:
                break
        except:
            print i,j
            print candidate
            j = j+1
            pass
    i = i+1
    PROPOSALS.append(region_proposals)



22 3
[  6.12000000e+02   1.79000000e+02   6.27000000e+02   1.60000000e+02
   5.68000638e-03]
56 1
[  3.00000000e+00   6.39000000e+02   6.20000000e+01   5.89000000e+02
   5.87930485e-02]
57 4
[  6.14000000e+02   2.05000000e+02   6.35000000e+02   1.59000000e+02
   1.92522321e-02]
57 6
[  6.24000000e+02   2.20000000e+02   6.41000000e+02   1.52000000e+02
   2.30389031e-02]
84 2
[  5.97000000e+02   2.97000000e+02   6.39000000e+02   1.87000000e+02
   9.20758929e-02]
84 4
[  6.16000000e+02   2.72000000e+02   6.39000000e+02   1.59000000e+02
   5.17976722e-02]
126 3
[  6.18000000e+02   2.93000000e+02   6.41000000e+02   1.94000000e+02
   4.53802615e-02]
126 4
[  6.18000000e+02   2.93000000e+02   6.41000000e+02   1.94000000e+02
   4.53802615e-02]
221 1
[  5.97000000e+02   3.80000000e+02   6.44000000e+02   2.88000000e+02
   8.61766582e-02]
239 5
[  6.08000000e+02   1.72000000e+02   6.33000000e+02   1.44000000e+02
   1.39508929e-02]
275 2
[  6.12000000e+02   3.21000000e+02   6.41000000e+02   2.9800

In [11]:
PROPOSALS = np.asarray(PROPOSALS)
for region_candidate in PROPOSALS:
    region_candidate = np.asarray(region_candidate)

In [None]:
for region_candidates in PROPOSALS:
    for candidates in region_candidates:
        

### Defining Model Parameters

In [1]:
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, RepeatVector
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.layers import Merge
from keras.optimizers import SGD
from keras.layers.core import Activation
import numpy as np
import h5py

Using Theano backend.
Using gpu device 1: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5105)


In [2]:
image_model = Sequential()
image_model.add(Dense(1000, input_dim=4096, activation="linear"))
image_model.add(RepeatVector(40))
print "Image model made."

Image model made.


In [3]:
vocab_size = 10000
max_caption_len = 40

print "Preparing Language Model."
language_model = Sequential()
language_model.add(Embedding(vocab_size,512, input_length=max_caption_len))
language_model.add(LSTM(output_dim=512, return_sequences=True))
language_model.add(TimeDistributed(Dense(512)))
language_model.output
print "Language Model set."

Preparing Language Model.
Language Model set.


In [4]:
print "Merging Language and Image Model"
full_model = Sequential()
full_model.add(Merge([image_model, language_model], mode='concat', concat_axis=-1))
full_model.add(LSTM(512, return_sequences=False))
full_model.add(Dense(10000))
full_model.add(Activation('softmax'))
print "Models merged..."

Merging Language and Image Model
Models merged...
