"""
Joint image-sentence embedding space
"""
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import cPickle as pkl
import numpy
import warnings

from collections import OrderedDict, defaultdict
def load_model(path_to_model):
"""
Load all model components
"""
# Load the worddict
with open('%s.dictionary.pkl'%path_to_model, 'rb') as f:
worddict = pkl.load(f)
# Create inverted dictionary
word_idict = dict()
for kk, vv in worddict.iteritems():
word_idict[vv] = kk
word_idict[0] = '<eos>'
word_idict[1] = 'UNK'
# Load model options
with open('%s.pkl'%path_to_model, 'rb') as f:
options = pkl.load(f)
# Load parameters
params = init_params(options)
params = load_params(path_to_model, params)
tparams = init_tparams(params)
# Extractor functions
    trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options)
f_senc = theano.function([x, x_mask], sentences, name='f_senc')
trng, [im], images = build_image_encoder(tparams, options)
f_ienc = theano.function([im], images, name='f_ienc')
# Store everything we need in a dictionary
model = {}
model['options'] = options
model['worddict'] = worddict
model['word_idict'] = word_idict
model['f_senc'] = f_senc
model['f_ienc'] = f_ienc
return model
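
# A minimal usage sketch (the path is hypothetical; load_model reads the saved
# parameters via numpy.load plus the companion <path>.pkl options and
# <path>.dictionary.pkl vocabulary files):
#
#   model = load_model('models/coco.npz')
#   s_emb = encode_sentences(model, ['a man riding a horse'])
#   im_emb = encode_images(model, cnn_feats)  # cnn_feats: (n, dim_image) float32
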
def encode_sentences(model, X, verbose=False, batch_size=128):
"""
Encode sentences into the joint embedding space
"""
features = numpy.zeros((len(X), model['options']['dim']), dtype='float32')
# length dictionary
ds = defaultdict(list)
captions = [s.split() for s in X]
for i,s in enumerate(captions):
ds[len(s)].append(i)
# quick check if a word is in the dictionary
    d = defaultdict(lambda: 0)
for w in model['worddict'].keys():
d[w] = 1
# Get features. This encodes by length, in order to avoid wasting computation
for k in ds.keys():
if verbose:
print k
numbatches = len(ds[k]) / batch_size + 1
for minibatch in range(numbatches):
caps = ds[k][minibatch::numbatches]
caption = [captions[c] for c in caps]
            seqs = []
            for i, cc in enumerate(caption):
                # map words to indices; OOV and above-threshold words map to UNK (1)
                seqs.append([model['worddict'][w]
                             if d[w] > 0 and model['worddict'][w] < model['options']['n_words']
                             else 1 for w in cc])
x = numpy.zeros((k+1, len(caption))).astype('int64')
x_mask = numpy.zeros((k+1, len(caption))).astype('float32')
for idx, s in enumerate(seqs):
x[:k,idx] = s
x_mask[:k+1,idx] = 1.
ff = model['f_senc'](x, x_mask)
for ind, c in enumerate(caps):
features[c] = ff[ind]
return features
def encode_images(model, IM):
"""
Encode images into the joint embedding space
"""
images = model['f_ienc'](IM)
return images
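
# Both encoders L2-normalize their outputs (see l2norm below), so a plain dot
# product between embedding matrices gives cosine similarities. A sketch for
# ranking images against the first query sentence:
#
#   scores = numpy.dot(s_emb, im_emb.T)       # (n_sentences, n_images)
#   ranked = numpy.argsort(scores[0])[::-1]   # best-matching images first
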
def _p(pp, name):
"""
make prefix-appended name
"""
return '%s_%s'%(pp, name)
def init_tparams(params):
"""
initialize Theano shared variables according to the initial parameters
"""
tparams = OrderedDict()
for kk, pp in params.iteritems():
tparams[kk] = theano.shared(params[kk], name=kk)
return tparams
def load_params(path, params):
"""
load parameters
"""
pp = numpy.load(path)
for kk, vv in params.iteritems():
if kk not in pp:
warnings.warn('%s is not in the archive'%kk)
continue
params[kk] = pp[kk]
return params
# layers: 'name': ('parameter initializer', 'feedforward')
layers = {'ff': ('param_init_fflayer', 'fflayer'),
'gru': ('param_init_gru', 'gru_layer')}
def get_layer(name):
    """
    Return the (parameter initializer, feedforward) pair for a layer type
    """
    fns = layers[name]
    return (eval(fns[0]), eval(fns[1]))
def init_params(options):
"""
Initialize all parameters
"""
params = OrderedDict()
# Word embedding
params['Wemb'] = norm_weight(options['n_words'], options['dim_word'])
# Sentence encoder
params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
nin=options['dim_word'], dim=options['dim'])
# Image encoder
    params = get_layer('ff')[0](options, params, prefix='ff_image',
                                nin=options['dim_image'], nout=options['dim'])
return params
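
# For the default GRU encoder, init_params produces these keys:
#   'Wemb'                       (n_words, dim_word)  word embedding table
#   'encoder_W',  'encoder_b'    gate input weights and biases
#   'encoder_U'                  gate recurrent weights
#   'encoder_Wx', 'encoder_Ux',
#   'encoder_bx'                 candidate-state weights and bias
#   'ff_image_W', 'ff_image_b'   linear image projection into the joint space
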
def build_sentence_encoder(tparams, options):
"""
Encoder only, for sentences
"""
trng = RandomStreams(1234)
# description string: #words x #samples
x = tensor.matrix('x', dtype='int64')
mask = tensor.matrix('x_mask', dtype='float32')
n_timesteps = x.shape[0]
n_samples = x.shape[1]
# Word embedding
emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
# Encode sentences
proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
prefix='encoder',
mask=mask)
sents = proj[0][-1]
sents = l2norm(sents)
return trng, [x, mask], sents
def build_image_encoder(tparams, options):
"""
Encoder only, for images
"""
trng = RandomStreams(1234)
# image features
im = tensor.matrix('im', dtype='float32')
# Encode images
images = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')
images = l2norm(images)
return trng, [im], images
def linear(x):
"""
Linear activation function
"""
return x
def tanh(x):
"""
Tanh activation function
"""
return tensor.tanh(x)
def l2norm(X):
    """
    L2-normalize the rows of X
    """
    norm = tensor.sqrt(tensor.pow(X, 2).sum(1))
    X /= norm[:, None]
    return X
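
# Quick sanity check of l2norm as a compiled function (a sketch, not part of
# the model itself):
#
#   X = tensor.matrix('X', dtype='float32')
#   f = theano.function([X], l2norm(X))
#   out = f(numpy.random.randn(3, 5).astype('float32'))
#   # every row of `out` now has unit Euclidean norm
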
def ortho_weight(ndim):
"""
Orthogonal weight init, for recurrent layers
"""
W = numpy.random.randn(ndim, ndim)
u, s, v = numpy.linalg.svd(W)
return u.astype('float32')
def norm_weight(nin, nout=None, scale=0.1, ortho=True):
    """
    Uniform initialization from [-scale, scale]
    If matrix is square and ortho=True, use ortho instead
    """
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout))
    return W.astype('float32')
def xavier_weight(nin, nout=None):
    """
    Xavier init
    """
    if nout is None:
        nout = nin
    r = numpy.sqrt(6.) / numpy.sqrt(nin + nout)
    W = numpy.random.rand(nin, nout) * 2 * r - r
    return W.astype('float32')
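
# Behaviour of the three initializers, in brief:
#
#   W = ortho_weight(4)           # orthonormal: W.T.dot(W) ~ identity
#   V = norm_weight(300, 1024)    # uniform in [-0.1, 0.1] (non-square case)
#   X = xavier_weight(300, 1024)  # uniform in [-r, r], r = sqrt(6/(nin+nout))
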
# Feedforward layer
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True):
"""
Affine transformation + point-wise nonlinearity
"""
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
params[_p(prefix,'W')] = xavier_weight(nin, nout)
params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32')
return params
def fflayer(tparams, state_below, options, prefix='rconv',
            activ='lambda x: tensor.tanh(x)', **kwargs):
    """
    Feedforward pass. `activ` is eval'd, so it can name an activation defined
    above ('linear', 'tanh') or be a lambda expression given as a string.
    """
    return eval(activ)(tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                       tparams[_p(prefix, 'b')])
# GRU layer
def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
"""
Gated Recurrent Unit (GRU)
"""
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']
W = numpy.concatenate([norm_weight(nin,dim),
norm_weight(nin,dim)], axis=1)
params[_p(prefix,'W')] = W
params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
U = numpy.concatenate([ortho_weight(dim),
ortho_weight(dim)], axis=1)
params[_p(prefix,'U')] = U
Wx = norm_weight(nin, dim)
params[_p(prefix,'Wx')] = Wx
Ux = ortho_weight(dim)
params[_p(prefix,'Ux')] = Ux
params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')
return params
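
# Resulting parameter shapes for prefix='encoder', nin=dim_word, dim=dim:
#   encoder_W  : (dim_word, 2*dim)  input weights for reset/update gates
#   encoder_b  : (2*dim,)           gate biases
#   encoder_U  : (dim, 2*dim)       recurrent weights for reset/update gates
#   encoder_Wx : (dim_word, dim)    input weights for the candidate state
#   encoder_Ux : (dim, dim)         recurrent weights for the candidate state
#   encoder_bx : (dim,)             candidate-state bias
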
def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, one_step=False, **kwargs):
"""
Feedforward pass through GRU
"""
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
dim = tparams[_p(prefix,'Ux')].shape[1]
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n*dim:(n+1)*dim]
return _x[:, n*dim:(n+1)*dim]
state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
U = tparams[_p(prefix, 'U')]
Ux = tparams[_p(prefix, 'Ux')]
    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        # reset and update gates
        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        # candidate hidden state, with the reset gate applied to the
        # recurrent contribution
        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_
        h = tensor.tanh(preactx)

        # mix previous and candidate states via the update gate, then carry
        # the previous state through masked (padding) positions
        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        return h
seqs = [mask, state_below_, state_belowx]
_step = _step_slice
if one_step:
rval = _step(*(seqs+[init_state, tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]]))
else:
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state],
                                    non_sequences=[tparams[_p(prefix, 'U')],
                                                   tparams[_p(prefix, 'Ux')]],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=False,
                                    strict=True)
rval = [rval]
return rval
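
# Minimal standalone use of gru_layer, mirroring build_sentence_encoder
# (a sketch; `emb` is a (n_timesteps, n_samples, dim_word) tensor and
# `mask` marks valid timesteps):
#
#   proj = gru_layer(tparams, emb, None, options, prefix='encoder', mask=mask)
#   sents = proj[0][-1]   # final hidden state of each sample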