Permalink
Switch branches/tags
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
539 lines (448 sloc) 20.4 KB
# general library
import numpy
from math import ceil, sqrt
# theano library
import theano
from theano import tensor as T
from theano.tensor.nnet import conv2d
try:
from theano.tensor.signal import pool
except ImportError:
pool = None
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
# others
from NetworkHiddenLayer import _NoOpLayer
from ActivationFunctions import strtoact
from cuda_implementation.FractionalMaxPoolingOp import fmp
from theano.sandbox.cuda import dnn
class CNN(_NoOpLayer):
    """
    Common base of the convolutional layers defined in this file.

    The constructor derives the filter shape and the output dimensions from the
    source layers and stores them as attributes; the remaining methods build the
    Theano graph for dropout, convolution, pooling and the bias/activation step.
    Sub-classes call :func:`run_cnn` on a 4D input to obtain ``self.Output``.
    """
    recurrent = True

    def __init__(self, n_features=1, filter=1, d_row=-1, border_mode="valid",
                 conv_stride=(1, 1), pool_size=(1, 1), filter_dilation=(1, 1), ignore_border=1,
                 pool_stride=0, pool_padding=(0, 0), mode="max",
                 activation="tanh", dropout=0.0, factor=1.0, base=None, transpose=False,
                 force_sample=False, **kwargs):
        """
        :param int n_features: integer
          the number of feature map(s), e.g. 32, 64, or so on.
          the input will be interpreted as (width|time, batch, height * n_in_features) and
          the output will be (width|time, batch, height * n_features).
        :param int|(int,int) filter: integer or sequence of length 2
          the filter size/shape, i.e. the number of row(s) and column(s) of the filter.
          an integer means that the number of rows equals the number of columns.
          e.g. 3, 5, (1,3), or so on.
        :param int d_row: integer
          the number of row(s) of the input.
          the default value is -1, in which case it is derived from the n_out of the input.
          it only has to be given for the first convolutional layer;
          later layers take the number of rows from the previous convolutional layer.
        :param str border_mode: string
          "valid" -- only apply filter to complete patches of the image.
                     Generates output of shape: (image_shape - filter_shape + 1).
          "full" -- zero-pads image to multiple of filter shape to generate output of shape:
                    (image_shape + filter_shape - 1).
          "same" -- keep the dimension of convolutional layer output the same as the input dimension
                    (stored internally as "half").
        :param (int,int) conv_stride: sequence of length 2
          factor by which to subsample the convolutional layer output.
          this stride is written in (rows,columns).
        :param (int,int) pool_size: sequence of length 2
          factor by which to downscale in the pooling layer.
          this is written in (rows,columns).
          the default value is (1,1).
        :param (int,int) filter_dilation: sequence of length 2
          factor by which to subsample (stride) the convolutional layer input.
        :param int|bool ignore_border: integer or boolean
          1 or True -- (5, 5) input with pool_size = (2, 2), will generate a (2, 2) pooling layer output.
          0 or False -- (5, 5) input with pool_size = (2, 2), will generate a (3, 3) pooling layer output.
        :param (int,int) pool_stride: sequence of length 2
          stride size, which is the number of shifts over rows/cols to get the next pool region.
          the default value is 0, which sets it equal to pool_size (no overlap of pooling regions).
        :param (int,int) pool_padding: sequence of length 2
          pad zeros to extend beyond four borders of the images.
          this is written in (pad_h,pad_w), where pad_h is the size of the top and bottom margins,
          and pad_w is the size of the left and right margins.
        :param str mode: string
          pooling layer mode that excludes the padding in the computation.
          "max" -- max pooling
          "sum" -- sum pooling
          "avg" -- average pooling
          "fmp" -- fractional max pooling
        :param str activation: string
          activation function, e.g. "tanh", "sigmoid", "relu", "elu", "maxout", and so on.
        :param float dropout: float
          dropout probability applied to the layer input; it is only used when > 0.0 and must be < 1.0.
        :param float factor: float
          factor by which to scale the initial weights.
        :param base: optional sequence of layers
          if given (truthy), the weights ``W`` and bias ``b`` of ``base[0]`` are reused
          instead of creating new parameters.
        :param bool transpose: boolean
          if true, the convolution is built with AbstractConv2d_gradInputs instead of conv2d.
        :param bool force_sample: boolean
          stored as ``self.force_sample``; the sub-classes in this file forward it to batch_norm.
        """
        super(CNN, self).__init__(**kwargs)
        self.base = base
        src = self.sources
        self.transpose = transpose
        self.status = self.get_status(src)  # [is_conv_layer, n_sources]
        self.is_1d = self.layer_class == "conv_1d"
        is_resnet = self.layer_class == "resnet"

        dimension = src[0].attrs["n_out"]  # input dimension

        if self.status[0]:  # all previous layers are convolution layers
            # the stack size is the number of feature maps of the previous layer
            stack_size = src[0].attrs["n_features"]
            d_row = src[0].attrs["d_row"]
            # floor division keeps the dimensions integral on Python 2 and 3 alike
            dimension //= stack_size

            # more than one input and not resnet means inception: feature maps get concatenated
            if self.status[1] != 1 and (not is_resnet):
                # check the spatial dimension of all inputs
                assert all((s.attrs["n_out"] // s.attrs["n_features"]) == dimension
                           for s in src), \
                    "The spatial dimension of all inputs have to be the same!"
                stack_size = sum([s.attrs["n_features"] for s in src])
        else:  # not convolution layer
            stack_size = 1  # first convolution layer: one channel (gray scale image)
            if self.is_1d:  # for processing entire image at once
                if d_row == -1:
                    d_row = dimension
                else:
                    stack_size = dimension // d_row
            elif d_row == -1:
                d_row = int(sqrt(dimension))
            assert self.status[1] == 1, "Except CNN, the input is only one!"

        # calculate the width of input
        d_col = dimension // d_row

        # expand a scalar filter size to [rows, cols]
        if isinstance(filter, int):
            filter = [filter, filter]

        if filter == [1, 1]:
            border_mode = "valid"

        # set attributes for resnet
        if is_resnet:
            n_features = src[1].attrs['n_features']
            border_mode = "same"
            pool_size = [1, 1]

        assert ignore_border == 0 or ignore_border == 1, "Ignore border only accept true (1) or false (0)"

        # use no-overlap pooling
        if pool_stride == 0:
            pool_stride = pool_size

        # calculate the dimension of CNN output;
        # only the second call's border_mode is kept (it maps "same" to "half")
        _, new_d_row = self.get_dim(d_row, filter[0], pool_size[0],
                                    border_mode, conv_stride[0],
                                    pool_stride[0], ignore_border,
                                    pool_padding[0])
        border_mode, new_d_col = self.get_dim(d_col, filter[1], pool_size[1],
                                              border_mode, conv_stride[1],
                                              pool_stride[1], ignore_border,
                                              pool_padding[1])

        assert (mode == "max" or mode == "sum" or
                mode == "avg" or mode == "fmp"), "invalid pooling mode!"

        if mode == "fmp":
            new_d_row = int(ceil(new_d_row))
            new_d_col = int(ceil(new_d_col))

        assert (new_d_row > 0), "invalid spatial rows dimensions!"
        n_out = new_d_row * n_features

        if not self.is_1d:
            assert (new_d_col > 0), "invalid spatial columns dimensions!"
            n_out *= new_d_col

        # filter shape is (nb feature maps, stack size, filter row, filter col)
        self.filter_shape = (n_features, stack_size, filter[0], filter[1])
        self.filter_dilation = filter_dilation
        self.input_shape = [d_row, d_col]
        self.modes = [border_mode, ignore_border, mode, activation]
        self.pool_params = [pool_size, pool_stride, pool_padding, conv_stride]
        self.other_params = [dropout, factor]
        self.force_sample = force_sample

        # set attributes
        self.set_attr("n_features", n_features)
        self.set_attr("d_row", new_d_row)  # number of output rows
        self.set_attr("n_out", n_out)  # number of output dimension

    def get_status(self, sources):
        """
        :param sources: the source layers of this layer
        :return: [is_conv_layer, n_sources], where is_conv_layer is true only if
          every source has one of the convolutional layer classes of this file
        """
        n_sources = len(sources)
        is_conv_layer = all(s.layer_class in ("conv", "frac_conv", "conv_1d", "resnet")
                            for s in sources)
        return [is_conv_layer, n_sources]

    def get_dim(self, input, filters, pools, border_mode, stride, pool_stride, ignore_border, pad):
        """
        Compute the output size along one spatial axis after convolution and pooling.

        :param int input: input size along this axis
        :param int filters: filter size along this axis
        :param int pools: pooling window size along this axis
        :param str border_mode: "valid", "full" or "same"
        :param int stride: convolution stride along this axis
        :param int pool_stride: pooling stride along this axis
        :param int|bool ignore_border: truncate (true) or round up (false) the pooled size
        :param int pad: pooling padding along this axis
        :return: (border_mode, size); "same" is returned as "half"
        """
        if border_mode == "valid":
            result = (input - filters + 1)
        elif border_mode == "full":
            result = (input + filters - 1)
        elif border_mode == "same":
            border_mode = "half"
            result = input
        else:
            assert False, "Invalid border_mode"

        if stride != 1:
            result = int(ceil(result / float(stride)))

        # equivalent to (result - pools) / pool_stride + 1
        result = (result - (pools - pool_stride)) / float(pool_stride)
        result = int(result) if ignore_border else int(ceil(result))
        # NOTE(review): the padding is added to the pooled size, not to the pooling input;
        # confirm that this matches the size pool_2d produces for non-zero padding.
        result += (2 * pad)
        return border_mode, result

    def calculate_index(self, inputs):
        """
        Zero the entries of ``inputs`` for which ``self.index`` is 0.

        :param inputs: 3D tensor whose first axis lines up with ``self.index.flatten()``,
          or a 4D tensor which is moved to (W, B, F, H) layout, flattened to 3D, masked
          and restored (assumed to be BFHW, as in the original comment)
        :return: tensor of the same layout as ``inputs`` with the masked rows set to zero
        """
        if inputs.ndim == 3:  # TBD
            return T.set_subtensor(
                inputs[((numpy.int8(1) - self.index.flatten()) > 0).nonzero()],
                T.zeros_like(inputs[0])
            )
        else:  # assume BFHW
            B = inputs.shape[0]
            inputs = inputs.dimshuffle(3, 0, 1, 2)  # WBFH
            inputs = self.calculate_index(
                inputs.reshape(
                    (inputs.shape[0] * inputs.shape[1],
                     inputs.shape[2],
                     inputs.shape[3])
                )
            )
            # explicit integer division so the shape stays integral on Python 3 as well
            return inputs.reshape((inputs.shape[0] // B, B, inputs.shape[1],
                                   inputs.shape[2])).dimshuffle(1, 2, 3, 0)

    def calculate_dropout(self, dropout, inputs):
        """
        Apply (inverted) dropout to ``inputs``.

        During training the input is multiplied by a Bernoulli keep-mask and rescaled by
        1 / (1 - dropout), so the expected activation equals the unmodified input that is
        used at inference. The previous version rescaled at inference instead of during
        training, which made train and test activations differ by (1 - dropout) ** 2.

        :param float dropout: dropout probability, must be < 1.0
        :param inputs: tensor to apply dropout on
        :return: tensor of the same shape as ``inputs``
        """
        assert dropout < 1.0, "Dropout have to be less than 1.0"
        mass = T.constant(1.0 / (1.0 - dropout), dtype="float32")
        # the seed is drawn unconditionally so that self.rng is consumed identically
        # in training and inference
        random = RandomStreams(self.rng.randint(1234) + 1)

        if self.train_flag:
            keep_mask = T.cast(
                random.binomial(n=1, p=1 - dropout, size=inputs.shape),
                theano.config.floatX
            )
            inputs = inputs * keep_mask * mass
        return inputs

    def convolution(self, inputs, filter_shape, stride, border_mode, factor, pool_size, filter_dilation):
        """
        Create (or reuse) the filter weights and convolve ``inputs`` with them.

        :param inputs: 4D input tensor
        :param filter_shape: (n_features, stack_size, filter_row, filter_col)
        :param stride: convolution stride (rows, cols)
        :param str border_mode: border mode passed to the convolution op
        :param float factor: scale of the initial weight range
        :param pool_size: pooling size, only used for the fan_out of the initialisation
        :param filter_dilation: dilation passed to conv2d
        :return: convolution output with the frames masked by :func:`calculate_index`
        """
        fan_in = numpy.prod(filter_shape[1:])  # stack_size * filter_row * filter_col
        # (n_features * (filter_row * filter_col)) / (pool_size[0] * pool_size[1])
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(pool_size))

        W_bound = numpy.sqrt(6. / (fan_in + fan_out)) * factor

        if self.base:
            # share the weights of the base layer
            W = self.add_param(self.base[0].W)
        else:
            W = self.add_param(
                self.shared(
                    value=numpy.asarray(
                        self.rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                        dtype='float32'
                    ),
                    borrow=True,
                    name="W_conv_" + self.name
                )
            )
        self.W = W

        if self.transpose:
            # NOTE(review): this branch passes symbolic shapes as imshp/kshp, hands a slice of
            # the input tensor (inputs[2:]) as third argument and ignores filter_dilation;
            # presumably the op expects the output spatial shape there -- confirm before use.
            op = T.nnet.abstract_conv.AbstractConv2d_gradInputs(
                imshp=inputs.shape,
                kshp=W.shape,
                subsample=stride, border_mode=border_mode,
                filter_flip=False)
            conv_out = op(W, inputs, inputs[2:])
        else:
            conv_out = conv2d(
                input=inputs,
                filters=W,
                filter_shape=filter_shape,
                filter_dilation=filter_dilation,
                subsample=stride,
                border_mode=border_mode
            )
        conv_out.name = "conv_out_" + self.name
        conv_out = self.calculate_index(conv_out)
        return conv_out

    def pooling(self, inputs, pool_size, ignore_border, stride, pad, mode):
        """
        Pool ``inputs`` over its last two dimensions.

        :param inputs: tensor to pool
        :param pool_size: pooling window (rows, cols)
        :param int|bool ignore_border: passed to pool_2d
        :param stride: pooling stride (rows, cols)
        :param pad: pooling padding (rows, cols)
        :param str mode: "max", "sum", "avg" or "fmp"
        :return: pooled tensor, or ``inputs`` itself when pool_size is the list [1, 1]
        """
        # NOTE: the shortcut only triggers for a list; a tuple (1, 1) does not compare
        # equal to [1, 1] and therefore still goes through the pooling op.
        if pool_size == [1, 1]:
            return inputs

        if mode == "avg":
            mode = "average_exc_pad"

        if mode == "fmp":
            height = inputs.shape[2]
            width = inputs.shape[3]
            batch = inputs.shape[0]
            X = inputs.dimshuffle(2, 3, 0, 1)  # (row, col, batches, filters)
            sizes = T.zeros((batch, 2))
            sizes = T.set_subtensor(sizes[:, 0], height)
            sizes = T.set_subtensor(sizes[:, 1], width)
            pooled_out, _ = fmp(X, sizes, pool_size[0])
            return pooled_out.dimshuffle(2, 3, 0, 1)

        pool_out = pool.pool_2d(
            input=inputs,
            ds=pool_size,  # TODO(theano 0.9): change to ws
            ignore_border=ignore_border,
            st=stride,  # TODO(theano 0.9): change to stride
            padding=pad,  # TODO(theano 0.9): change to pad
            mode=mode
        )
        pool_out.name = "pool_out_" + self.name
        return pool_out

    def bias_term(self, inputs, n_features, activation):
        """
        Add the per-feature-map bias and apply the activation function.

        :param inputs: 4D tensor with the feature maps on axis 1
        :param int n_features: number of feature maps (size of the bias vector)
        :param str activation: activation name; "maxout" is treated as "identity" here
        :return: activated tensor with the frames masked by :func:`calculate_index`
        """
        if self.base:
            # share the bias of the base layer
            b = self.add_param(self.base[0].b)
        else:
            b = self.add_param(
                self.shared(
                    value=numpy.zeros((n_features,), dtype='float32'),
                    borrow=True,
                    name="b_conv_" + self.name
                )
            )
        self.b = b

        act = strtoact('identity') if activation == 'maxout' else strtoact(activation)
        output = act(inputs + b.dimshuffle("x", 0, "x", "x"))  # (time*batch, filter, out-row, out-col)
        output.name = "output_bias_term_" + self.name
        output = self.calculate_index(output)
        return output

    def run_cnn(self, inputs, filter_shape, filter_dilation, params, modes, others):
        """
        Run dropout, convolution, pooling and bias/activation in that order.

        :param inputs: 4D input tensor
        :param filter_shape: (n_features, stack_size, filter_row, filter_col)
        :param filter_dilation: dilation passed to the convolution
        :param params: [pool_size, pool_stride, pool_padding, conv_stride]
        :param modes: [border_mode, ignore_border, pooling mode, activation]
        :param others: [dropout, factor]
        :return: the layer output as 4D tensor
        """
        # dropout
        if others[0] > 0.0:
            inputs = self.calculate_dropout(others[0], inputs)

        conv_out = self.convolution(inputs, filter_shape, params[3], modes[0], others[1], params[0], filter_dilation)
        pool_out = self.pooling(conv_out, params[0], modes[1], params[1], params[2], modes[2])
        if self.is_1d:
            # pool the index along the time axis as well so that it matches the pooled output
            self.index = self.pooling(self.index.dimshuffle(1, 'x', 0),
                                      [1, params[0][1]],
                                      modes[1],
                                      params[1],
                                      params[2],
                                      modes[2]).dimshuffle(2, 0, 1).flatten(2)
        output = self.bias_term(pool_out, filter_shape[0], modes[3])
        return output
class NewConv(CNN):
    """
    this class is for standard CNN and inception
    """
    layer_class = "conv"

    def __init__(self, **kwargs):
        """
        Build the convolution on the frame-wise input and expose the result both as
        4D ``self.Output`` and as 3D ``self.output`` (time, batch, dim).

        :param kwargs: forwarded unchanged to :class:`CNN`
        """
        super(NewConv, self).__init__(**kwargs)

        # our CRNN input is 3D tensor that consists of (time, batch, dim)
        # however, the convolution function only accept 4D tensor which is (batch size, stack size, nb row, nb col)
        # therefore, we should convert our input into 4D tensor
        inputs = self.sources[0].output  # (time, batch, input-dim = row * col * stack_size)
        time = inputs.shape[0]
        batch = inputs.shape[1]

        if self.status[0]:  # the previous layer is convolutional layer
            self.input = T.concatenate([s.Output for s in self.sources], axis=1)  # (batch, stack size, row, col)
        else:
            # In case of spliced data, the last dim in inputs contains stacked frames (e.g. ASR).
            # Since Theano reshape will read _and_ write elements row-wise, we need to transpose the target matrix.
            # This is done by swapping target dimensions (reshape(.., input_shape[1], input_shape[0]) and subsequent
            # dimshuffle that puts row and col dim where Theano expects them.
            inputs2 = inputs.reshape((time * batch, self.input_shape[1],
                                      self.input_shape[0], self.filter_shape[1]))  # (time*batch, col, row, stack)
            self.input = inputs2.dimshuffle(0, 3, 2, 1)  # (batch, stack_size, row, col)
        self.input.name = "conv_layer_input_final"

        # Every activation except tanh is deferred until after the optional batch
        # normalisation: run_cnn then only sees "identity". The configured name is kept in
        # a local because self.modes[3] is overwritten; checking self.modes[3] afterwards
        # (as the code did before) made the maxout branch below unreachable.
        activation = self.modes[3]
        if activation != "tanh":
            act = strtoact(activation)
            self.modes[3] = "identity"

        self.Output = self.run_cnn(
            inputs=self.input,
            filter_shape=self.filter_shape,
            filter_dilation=self.filter_dilation,
            params=self.pool_params,
            modes=self.modes,
            others=self.other_params
        )  # (batch, nb feature maps, out-row, out-col)

        if self.attrs['batch_norm']:
            self.Output = self.batch_norm(
                h=self.Output.reshape(
                    (self.Output.shape[0],
                     self.Output.shape[1] * self.Output.shape[2] * self.Output.shape[3])
                ),
                dim=self.attrs['n_out'],
                force_sample=self.force_sample
            ).reshape(self.Output.shape)

        if activation != "tanh":
            self.Output = act(self.Output)

        if activation == 'maxout':
            # NOTE(review): strtoact('maxout') has already been applied above, exactly as the
            # original statement order dictates; confirm that it preserves the 4D shape.
            # maxout: keep the maximum over the feature maps, which leaves a single map
            self.Output = T.max(self.Output, axis=1).dimshuffle(0, 'x', 1, 2)
            # floor division keeps n_out an integer on Python 3 as well
            self.attrs['n_out'] //= self.attrs['n_features']
            self.attrs['n_features'] = 1

        # our CRNN only accept 3D tensor (time, batch, dim)
        # so, we have to convert back the output to 3D tensor
        output2 = self.Output.dimshuffle(0, 2, 3, 1)  # (batch, out-row, out-col, nb feature maps)
        self.output = output2.reshape(
            (time, batch, output2.shape[1] * output2.shape[2] * output2.shape[3])
        )  # (time, batch, out-dim)
class ConcatConv(CNN):
    """
    this class is for the CNN that processes an entire line image as the input,
    i.e. the frames are concatenated along the time axis.
    """
    layer_class = "conv_1d"

    def __init__(self, padding=False, **kwargs):
        """
        :param bool padding: if true and the pooling width is > 1, zeros are appended to the
          input and to the index along the time axis before the convolution
        :param kwargs: forwarded unchanged to :class:`CNN`; ``d_row`` is additionally read here
        """
        super(ConcatConv, self).__init__(**kwargs)

        merged = T.concatenate([s.output for s in self.sources], axis=2)  # (time, batch, row * features)
        n_time = merged.shape[0]
        n_batch = merged.shape[1]
        stack = self.filter_shape[1]

        if self.status[0]:
            # sources are convolutional layers: join their 4D outputs along the time axis
            self.input = T.concatenate([s.Output for s in self.sources], axis=3)  # (batch, stack, row, time)
        else:
            d_row = kwargs.get('d_row', -1)
            n_rows = merged.shape[2] if d_row == -1 else d_row
            as_4d = merged.reshape((n_time, n_batch, n_rows, stack))  # (time, batch, row, stack)
            self.input = as_4d.dimshuffle(1, 3, 2, 0)  # (batch, stack, row, time)
        self.input.name = "conv_layer_input_final"

        if self.pool_params[0][1] > 1 and padding:
            xp = T.constant(self.pool_params[0][1], 'int32')
            # zeros appended at the end of the time axis of the input ...
            input_pad = T.zeros((n_batch, stack, self.input.shape[2],
                                 xp - T.mod(self.input.shape[3], xp)), 'float32')
            self.input = T.concatenate([self.input, input_pad], axis=3)
            # ... and matching zero (masked) entries appended to the index
            index_pad = T.zeros((xp - T.mod(self.index.shape[0], xp), n_batch), 'int8')
            self.index = T.concatenate([self.index, index_pad], axis=0)

        if self.modes[0] == "valid" and self.filter_shape[3] > 1:
            # a "valid" convolution shortens the time axis, so trim the index on both sides
            idx = int(self.filter_shape[3] / 2)
            self.index = self.index[idx:-idx]

        self.Output = self.run_cnn(
            inputs=self.input,
            filter_shape=self.filter_shape,
            filter_dilation=self.filter_dilation,
            params=self.pool_params,
            modes=self.modes,
            others=self.other_params
        )  # (batch, nb feature maps, out-row, time)

        if self.attrs['batch_norm']:
            conv = self.Output
            # batch norm is computed per feature map: move the maps to the last axis and
            # flatten everything else into the first one
            flat = conv.dimshuffle(0, 2, 3, 1).reshape(
                (conv.shape[0] * conv.shape[2] * conv.shape[3],
                 conv.shape[1])
            )
            bn_args = dict(h=flat,
                           dim=self.attrs['n_features'],
                           force_sample=self.force_sample)
            if self.base is not None:
                # reuse the normalisation statistics and parameters of the base layer
                bn_args.update(sample_mean=self.base[0].sample_mean,
                               gamma=self.base[0].gamma,
                               beta=self.base[0].beta)
            normed = self.batch_norm(**bn_args)
            self.Output = normed.reshape((conv.shape[0],
                                          conv.shape[2],
                                          conv.shape[3],
                                          conv.shape[1])).dimshuffle(0, 3, 1, 2)

        # our CRNN only accept 3D tensor (time, batch, dim)
        # so, we have to convert back the output to 3D tensor
        time_major = self.Output.dimshuffle(3, 0, 1, 2)  # (time, batch, nb feature maps, out-row)
        self.output = time_major.reshape((time_major.shape[0], time_major.shape[1],
                                          time_major.shape[2] * time_major.shape[3]))  # (time, batch, out-dim)
class ResNet(CNN):
    """
    this class is for resnet connection.
    """
    layer_class = "resnet"

    def __init__(self, **kwargs):
        """
        Add the 4D outputs of exactly two convolutional source layers, apply a ReLU and
        optionally batch normalisation, and expose the result as (time, batch, dim).

        :param kwargs: forwarded unchanged to :class:`CNN`
        """
        super(ResNet, self).__init__(**kwargs)
        assert self.status[1] == 2, "Only accept 2 sources!"
        assert self.status[0], "Only accept cnn layers!"

        shortcut = self.sources[0]
        residual = self.sources[1]
        # time and batch sizes are taken from the 3D output of the first source
        n_time = shortcut.output.shape[0]
        n_batch = shortcut.output.shape[1]

        self.input = T.add(shortcut.Output, residual.Output)
        self.Output = T.nnet.relu(self.input)

        if self.attrs['batch_norm']:
            activated = self.Output
            # flatten everything but the first axis for batch_norm, then restore the 4D shape
            flat = activated.reshape(
                (activated.shape[0],
                 activated.shape[1] * activated.shape[2] * activated.shape[3])
            )
            normed = self.batch_norm(
                h=flat,
                dim=self.attrs['n_out'],
                force_sample=self.force_sample
            )
            self.Output = normed.reshape(activated.shape)

        maps_last = self.Output.dimshuffle(0, 2, 3, 1)  # (time*batch, out-row, out-col, nb feature maps)
        flat_dim = maps_last.shape[1] * maps_last.shape[2] * maps_last.shape[3]
        self.output = maps_last.reshape((n_time, n_batch, flat_dim))  # (time, batch, out-dim)