# Optimize 3d conv

First results:   
<pre>
python perf_convs.py --inputs 20 20 20 20 3 --filters 7 3 6 2 3      
Input shape  [20, 20, 20, 20, 3]
Filter shape [7, 3, 6, 2, 3]
Output shape (20, 18, 15, 19, 7)
#Inputs   480000
#Weights     756
#Outputs  718200
#Multiplications 542959200
Theano 3d                 runtime per iteration (  14 iterations): 143.8815ms
Theano 3d2d               runtime per iteration (  78 iterations):  12.9532ms
Python Vectorized         runtime per iteration (  25 iterations):  82.0083ms
Cudamat Vectorized        runtime per iteration (  23 iterations):  87.3997ms
Theano Mul Vectorized     runtime per iteration (  28 iterations):  72.9542ms
GPU Loop                  runtime per iteration (   1 iterations): 20829.8290ms


</pre>

After Cudnn theano 3d2d faster:

<pre>
python perf_convs.py --inputs 20 20 20 20 3 --filters 7 3 6 2 3
Using gpu device 0: GeForce GTX 780
Batches/Filters, rows, columns, times, channels
Input shape  [20, 20, 20, 20, 3]
Filter shape [7, 3, 6, 2, 3]
Output shape (20, 18, 15, 19, 7)
#Inputs   480000
#Weights     756
#Outputs  718200
#Multiplications 542959200

Theano 3d                 runtime per iteration (  14 iterations): 143.8013ms
Theano 3d2d               runtime per iteration ( 286 iterations):   3.5014ms
Python Vectorized         runtime per iteration (  25 iterations):  80.9831ms
Cudamat Vectorized        runtime per iteration (  24 iterations):  86.5586ms
Theano Mul Vectorized     runtime per iteration (  28 iterations):  71.6942ms
GPU Loop                  runtime per iteration (   1 iterations): 20715.0171ms

</pre>

In [1]:
# 80 x 80 x 3 image input with 5 - 40 frames batch size 64, 256
# 5 x  5 x 3 x 5(zeit) filter, 128 davon
# wenn im vergleich yuzu 2d nur um faktor zeit oder ein bisschen mehr groesser dann ok

In [2]:
# wrap as a layer usable in theano:  https://github.com/benanne/Lasagne/blob/master/lasagne/layers/corrmm.py

In [None]:
# or pylearn2

In [25]:
# For given inputs above we can have maximum (filtersxbatchsize)
# http://www.nvidia.de/object/geforce-gtx-780-de.html#pdpContent=2 => 3072 MB RAM
# Keep in mind this is ignoring memory for filters and input and assuming all memory is free before convolution

GPU_memory_bytes = 3072 * (1024**2)
given_output_elements = ((80 - 5 + 1) * (80 - 5 + 1) * (40 - 5 + 1))
float_bytes = 4

maximum_batches_times_filters = GPU_memory_bytes / (given_output_elements * float_bytes)
print "Maximum of batches * filters:", maximum_batches_times_filters
print("32x32?", 32*32 < maximum_batches_times_filters)
print("32x64?", 32*64 < maximum_batches_times_filters)
print("64x64?", 64*64 < maximum_batches_times_filters)

Maximum of batches * filters: 3872
('32x32?', True)
('32x64?', True)
('64x64?', False)


## Debug

In [1]:
import os
os.sys.path.insert(0,'/home/schirrmr/3dconv/') 
%cd /home/schirrmr/3dconv/pylearn3dconv/
%load_ext autoreload
%autoreload 2

/home/schirrmr/3dconv/pylearn3dconv


In [54]:
#!PYTHONPATH=$PYTHONPATH:`pwd`/../ python ipython_scripts.py
!PYTHONPATH=$PYTHONPATH:`pwd`/../../ python ../ipython_scripts.py

Using gpu device 0: GeForce GTX 780
inside conv fwd
returning zero from set tensor
returning zero from set filter
after conv fwd setting tensors/filters
returning zero from set tensor
Handle zu 119899872
Handle d 119899872
Conv algo 3
Input descriptor
Float: 1
nbDims: 5
Dimensions: 3 3 3 3 3 
Strides: 81 27 9 3 1 

Filter descriptor
Float: 1
nbDims: 5
Dimensions: 3 3 3 3 3 

Conv descriptor
Float: 1
conv dims: 3
Paddings: 0 0 0 
filterStrideA: 1 1 1 
upscaleA: 1 1 1 
mode convolution: 1
mode correlation: 0

Output
Float: 1
nbDims: 5
Dimensions: 3 3 1 1 1 
Strides: 3 1 1 1 1 
after cudnnGetTensorNdDescriptor
handle 94159552
after cudnnGetConvolutionForwardWorkspaceSize
worksize requested: 0
Traceback (most recent call last):
  File "../ipython_scripts.py", line 82, in <module>
    result=conv_result_func(real_inputs, real_filters)
  File "/home/schirrmr/3dconv/Theano/theano/compile/function_module.py", line 597, in __call__
    outputs = self.fn()
  File "/home/schirrmr/3dconv/Theano/th

In [None]:
# make test case with this data, check that you always get same results
#
# for pyyaml later:
# create h5 files with data, create "h5volumetricdataset" class which reads out adata and calls superconstructor

In [2]:
!make volumetric_in_c

make: *** No rule to make target `volumetric_in_c'.  Stop.


In [3]:
! ./volumetric_in_c

/bin/sh: 1: ./volumetric_in_c: not found


## Imports

In [2]:
from pylearn3dconv.volumetric_space import Conv3DSpace
from pylearn3dconv.layers.theano_3d_conv import Theano3dConv3dElemwise
import numpy as np
from pylearn2.models.mlp import IdentityConvNonlinearity
import theano
import theano.tensor as T
from pylearn3dconv.volumetric_dense_design_matrix import VolumetricDenseDesignMatrix
from pylearn2.training_algorithms.sgd import SGD
from pylearn2.models.mlp import MLP, Softmax, ConvElemwise
from pylearn2.format.target_format import OneHotFormatter
from numpy.random import RandomState
from pylearn2.space import Conv2DSpace
from pylearn3dconv.layers.blas2d_manuel_conv import ConvElemwiseBlas
from pylearn3dconv.layers.cublas_3d_conv import CuBlasConv3dElemwise
from pylearn3dconv.layers.cudnn_3d_conv import CuDnnConv3dElemwise
from pylearn3dconv.perf.perf_layers import create_fprop_layer_3d_symbolic
from pylearn3dconv.test_data import generate_test_data
import theano.sandbox.cuda
import theano
import theano.sandbox.cuda.dnn as cdnn
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           host_from_gpu,
                                           gpu_contiguous, HostFromGpu,
                                           gpu_alloc_empty)
from theano.sandbox.cuda.dnn import GpuDnnConv, GpuDnnConvDesc
from numpy.random import RandomState
from pylearn3dconv.theano_dnn_first_try.theano_dnn_conv import GpuDnn3dConv, GpuDnnConv3dDesc

import numpy as np
import theano_dnn_first_try.theano_dnn_conv as owndnn
ftensor5 = T.TensorType('float32', (False,)*5)
class FakeMLP():
    def __init__(self,rng,batch_size):
        self.rng = rng
        self.batch_size = batch_size

Using gpu device 0: GeForce GTX TITAN Black


## Pooling Cudnn

In [79]:
from pylearn3dconv.theano_dnn_first_try.theano_dnn_conv import GpuDnnPool3dDesc, GpuDnn3dPool
inputs_shape = [5,3,4,7,2]
inputs_shape = [5,3,4,8,2]

pool_shape = [4,8,2]
pool_shape = [2,2,1]
pool_stride = (2,7,1)
pool_input = ftensor5()
pool_desc = GpuDnnPool3dDesc(tuple(pool_shape),pool_stride, 'max', pad=(0,0,0))()
dnn_3d_pool_result = GpuDnn3dPool()(pool_input, pool_desc)
pool_func = theano.function([pool_input], dnn_3d_pool_result)

INFO (theano.gof.compilelock): Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock


In [80]:
inputs = np.ones(inputs_shape).astype(np.float32)
pool_func(inputs)

RuntimeError: GpuDnnPool: error doing cudnnPoolingForward operation: CUDNN_STATUS_NOT_SUPPORTED
Apply node that caused the error: GpuDnn3dPool(GpuFromHost.0, GpuDnnPool3dDesc{ws=(2, 2, 1), stride=(2, 7, 1), mode='max', pad=(0, 0, 0)}.0)
Inputs types: [CudaNdarrayType(float32, 5D), <theano.gof.type.CDataType object at 0x7f0ab292dfd0>]
Inputs shapes: [(5, 3, 4, 8, 2), 'No shapes']
Inputs strides: [(192, 64, 16, 2, 1), 'No strides']
Inputs values: ['not shown', <PyCObject object at 0x7f0ac9dd8d00>]

Backtrace when the node is created:
  File "/home/schirrmr/3dconv/pylearn3dconv/theano_dnn_first_try/theano_dnn_conv.py", line 634, in make_node
    [img.type()])

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

## Gradient for Cudnn Test 

### Reference Result from Cublas 

In [94]:
# get small input
rng = RandomState(np.uint32(hash('tobiderpuma')))
inputs_shape = [5,8,4,7,3]
inputs_shape = [5,2,4,7,3]
filters_shape = [6,2,3,5,3]

bias *= 0

inputs, filters, bias = generate_test_data(rng, inputs_shape, filters_shape)
# compute gradient for Cublas
# do it twice, compare result
x = T.dscalar('x')
inputs_theano = ftensor5()

conv_result = create_fprop_layer_3d_symbolic(inputs_shape, filters, bias, CuBlasConv3dElemwise, inputs_theano)
cost = T.sum(conv_result)
conv_gradient = T.grad(cost, inputs_theano)
grad_func = theano.function([inputs_theano], conv_gradient)

In [95]:
correct_result = grad_func(inputs)

### Result from manually using Cudnn Op - fine, same result as Cublas if bias is 0

In [76]:
inputs_theano_cudnn_op = ftensor5()
filters_theano_cudnn_op = ftensor5()

x  = gpu_contiguous(inputs_theano_cudnn_op)
filters_cudnn_op = gpu_contiguous(filters_theano_cudnn_op)
desc = GpuDnnConv3dDesc(conv_mode='cross')()
desc_op = desc.owner.op
out_shp = GpuDnn3dConv.get_out_shape(x.shape, filters_cudnn_op.shape,
                                   desc_op.subsample)


out = gpu_alloc_empty(*out_shp)
rval = GpuDnn3dConv()(x, filters_cudnn_op, out, desc)
cost_cudnn_op = T.sum(rval)
conv_dnn_op_gradient = T.grad(cost_cudnn_op, inputs_theano_cudnn_op)
grad_func_cudnn_op = theano.function([inputs_theano_cudnn_op, filters_cudnn_op], conv_dnn_op_gradient)

INFO (theano.gof.compilelock): Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock


In [77]:
cudnn_op_result = grad_func_cudnn_op(inputs, filters)
assert np.sum(np.square(cudnn_op_result - correct_result)) < 1e-4

RuntimeError: GpuDnn3dConvGradI: error doing operation: CUDNN_STATUS_BAD_PARAM
Apply node that caused the error: GpuDnn3dConvGradI{inplace=False}(<CudaNdarrayType(float32, 5D)>, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConv3dDesc{subsample=(1, 1, 1), conv_mode='cross'}.0, Constant{1.0}, Constant{0.0})
Inputs types: [CudaNdarrayType(float32, 5D), CudaNdarrayType(float32, 5D), CudaNdarrayType(float32, 5D), <theano.gof.type.CDataType object at 0x7feac419a550>, Scalar(float32), Scalar(float32)]
Inputs shapes: [(6, 2, 3, 5, 3), (5, 6, 2, 3, 1), (5, 8, 4, 7, 3), 'No shapes', (), ()]
Inputs strides: [(90, 45, 15, 3, 1), (36, 6, 3, 1, 0), (672, 84, 21, 3, 1), 'No strides', (), ()]
Inputs values: ['not shown', 'not shown', 'not shown', <PyCObject object at 0x7feac48a82b0>, 1.0, 0.0]

Backtrace when the node is created:
  File "/home/schirrmr/3dconv/pylearn3dconv/theano_dnn_first_try/theano_dnn_conv.py", line 457, in make_node
    def infer_shape(self, node, shape):

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

###  Result from using 2d dnn op - fine, same as reference if bias is 0

In [41]:
inputs_theano_dnn2d = T.ftensor4()
filters_theano_dnn2d = T.ftensor4()

x  = gpu_contiguous(inputs_theano_dnn2d)
filters_theano_dnn2d = gpu_contiguous(filters_theano_dnn2d)
desc = GpuDnnConvDesc(border_mode='valid', conv_mode='cross')(x.shape, filters_theano_dnn2d.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(x.shape, filters_theano_dnn2d.shape, 'valid',
                                   desc_op.subsample)


out = gpu_alloc_empty(*out_shp)
rval = GpuDnnConv()(x, filters_theano_dnn2d, out, desc)
cost_dnn2d_op = T.sum(rval)
conv_dnn2d_op_gradient = T.grad(cost_dnn2d_op, inputs_theano_dnn2d)
grad_func_dnn2d_op = theano.function([inputs_theano_dnn2d, filters_theano_dnn2d], conv_dnn2d_op_gradient)

INFO (theano.gof.compilelock): Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock


In [50]:
dnn2d_result = grad_func_dnn2d_op(inputs[:,:,:,0,:], filters[:,:,:,0,:])
print np.sum(np.square(dnn2d_result - correct_result[:,:,:,0,:]))

0.0


### Layer - all fine same as cublas layer unless 0 dimension filter and input shape is equal 

In [96]:
inputs_theano_cudnn = ftensor5()
inputs_theano_cudnn_contiguous = gpu_contiguous(inputs_theano_cudnn)
conv_result_cudnn = create_fprop_layer_3d_symbolic(inputs_shape, filters, bias, CuDnnConv3dElemwise, 
                                                   inputs_theano_cudnn_contiguous)
cost_cudnn = T.sum(conv_result_cudnn)
cost_cudnn = gpu_contiguous(cost_cudnn)
conv_dnn_gradient = T.grad(cost_cudnn, inputs_theano_cudnn)
grad_func_cudnn = theano.function([inputs_theano_cudnn], conv_dnn_gradient)

INFO (theano.gof.compilelock): Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /home/schirrmr/.theano/compiledir_Linux-3.13--generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/lock_dir/lock


In [97]:
cudnn_result = grad_func_cudnn(inputs)
assert np.sum(np.square(cudnn_result- correct_result)) < 1e-4

RuntimeError: GpuDnn3dConvGradI: error doing operation: CUDNN_STATUS_NOT_SUPPORTED
Apply node that caused the error: GpuDnn3dConvGradI{inplace=False}(GpuContiguous.0, GpuContiguous.0, GpuAllocEmpty.0, GpuDnnConv3dDesc{subsample=(1, 1, 1), conv_mode='cross'}.0, Constant{1.0}, Constant{0.0})
Inputs types: [CudaNdarrayType(float32, 5D), CudaNdarrayType(float32, 5D), CudaNdarrayType(float32, 5D), <theano.gof.type.CDataType object at 0x7feac0485cd0>, Scalar(float32), Scalar(float32)]
Inputs shapes: [(6, 3, 2, 3, 5), (5, 6, 1, 2, 3), (5, 3, 2, 4, 7), 'No shapes', (), ()]
Inputs strides: [(90, 30, 15, 5, 1), (36, 6, 0, 3, 1), (168, 56, 28, 7, 1), 'No strides', (), ()]
Inputs values: ['not shown', 'not shown', 'not shown', <PyCObject object at 0x7feadcfdfd78>, 1.0, 0.0]

Backtrace when the node is created:
  File "/home/schirrmr/3dconv/pylearn3dconv/theano_dnn_first_try/theano_dnn_conv.py", line 457, in make_node
    def infer_shape(self, node, shape):

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

## CuDnn 3d Theano 

## Theano Memory Leak replication 

In [8]:
import theano.sandbox.cuda
print("Free memory at start {:5.1f} MB".format(
        theano.sandbox.cuda.mem_info()[0] / (1024.0 ** 2)))
print("Total memory {:5.1f} MB".format(
        theano.sandbox.cuda.mem_info()[1] / (1024.0 ** 2)))
# 2983.2 MB before import cell above
# 2984.2 MB after import cell above -> all ok

Free memory at start 2984.2 MB
Total memory 3071.3 MB


In [19]:
from pylearn3dconv.perf.perf_layers import generate_2d_3d_test_data
rng = RandomState(hash('tobipuma') % 4294967295)
inputs_shape = [12,20,20,14,3]
filters_shape = [12,5,5,5,3]
inputs, filters, bias, inputs_2d, filters_2d = generate_2d_3d_test_data(
    rng, inputs_shape, filters_shape)   


print("Free memory at start {:5.1f} MB".format(
        theano.sandbox.cuda.mem_info()[0] / (1024.0 ** 2)))

Free memory at start 2975.2 MB


In [20]:
from pylearn3dconv.perf.perf_layers import compute_2d_reference_result
reference_result2d = compute_2d_reference_result(inputs_2d,filters_2d,bias)

print("Free memory after compute reference result {:5.1f} MB".format(
        theano.sandbox.cuda.mem_info()[0] / (1024.0 ** 2)))

Input shape: (20, 20)
Detector space: (16, 16)
Output space: (16, 16)
Free memory after compute reference result 2975.2 MB


In [21]:
del inputs_2d
del filters_2d
del bias
del reference_result2d
print("Free memory after del reference result {:5.1f} MB".format(
        theano.sandbox.cuda.mem_info()[0] / (1024.0 ** 2)))

Free memory after del reference result 2975.2 MB


##Test Conv2d Layer Convolution replication 

In [72]:
from pylearn3dconv.test_data import generate_test_data
rng = RandomState(hash('tobipuma') % 4294967295)
inputs_shape = [15, 6, 9, 11, 3]
filters_shape = [12, 2, 4, 5, 3]
inputs, filters, bias = generate_test_data(rng, inputs_shape, filters_shape)
mlp = FakeMLP(rng=np.random,batch_size=inputs_shape[0])

In [109]:
def create_fprop(conv_layer_class, inputs_shape, filters, bias, mlp):

    conv_2d_layer = conv_layer_class(output_channels=filters.shape[0], 
        kernel_shape=filters.shape[1:3], tied_b=True,
        layer_name='conv_lin', nonlinearity=IdentityConvNonlinearity(),
        irange=0.001)
    conv_2d_layer.set_mlp(mlp)
    conv_2d_layer.set_input_space(Conv2DSpace(shape=inputs_shape[1:3], 
        num_channels=inputs_shape[4]))
    converted_weights = Conv2DSpace.convert_numpy(filters[:,:,:,0,:], 
                                                  conv_2d_layer.input_space.axes, 
                                                  conv_2d_layer.detector_space.axes)

    conv_2d_layer.set_weights(converted_weights)
    conv_2d_layer.set_biases(bias)
    inputs_2d_theano = T.ftensor4()
    conv2d_result = conv_2d_layer.fprop(inputs_2d_theano)
    conv2d = theano.function([inputs_2d_theano], conv2d_result)
    return conv2d, conv_2d_layer

In [110]:
conv_2d, conv_2d_layer = create_fprop(ConvElemwise, inputs.shape, filters, bias, mlp)

Input shape: (6, 9)
Detector space: (5, 6)
Output space: (5, 6)


In [116]:
print np.sum(np.square(conv_2d_layer.get_biases()-conv_2d_blas_layer.get_biases()))
# same!
print np.sum(np.square(conv_2d_layer.get_params()[0].get_value()-
                       conv_2d_blas_layer.get_params()[0].get_value()))
# still same
# so one is doing correlation other convolution...so flip filters???!?!

0.0
0.0


In [126]:
input_shape_2d = inputs.shape[0:3] + (inputs.shape[4],)
conv_2d_blas, conv_2d_blas_layer = create_fprop(ConvElemwiseBlas, inputs.shape, filters[:,::-1,::-1,:,:], bias, mlp)

result1= conv_2d(np.ones(input_shape_2d).astype(np.float32))
result2= conv_2d_blas(np.ones(input_shape_2d).astype(np.float32))
result1= conv_2d(inputs[:,:,:,0,:])
result2= conv_2d_blas(inputs[:,:,:,0,:])

print np.sum(np.square(result2-result1))

1.46139e-09


## Test Theano Cudnn Op 3d Convolution

In [6]:

inputs = ftensor5()
filters = ftensor5()

desc = owndnn.GpuDnnConv3dDesc(subsample=(1,1,1))()

forward_conv = 1
desc_op = desc.owner.op
out_shp = owndnn.GpuDnn3dConv.get_out_shape(inputs.shape, filters.shape,
                                   desc_op.subsample)

out = gpu_alloc_empty(*out_shp)

conv_result = owndnn.GpuDnn3dConv()(inputs, filters, out, desc)

conv_result_func = theano.function([inputs, filters], conv_result, mode='DebugMode')
"""
real_inputs = np.random.normal(size=(5,3,4,3,1)).astype(np.float32)
real_filters = np.random.normal(size=(2,3,3,2,1)).astype(np.float32)
result=conv_result_func(real_inputs, real_filters)
print np.array(result)
"""

'\nreal_inputs = np.random.normal(size=(5,3,4,3,1)).astype(np.float32)\nreal_filters = np.random.normal(size=(2,3,3,2,1)).astype(np.float32)\nresult=conv_result_func(real_inputs, real_filters)\nprint np.array(result)\n'

In [7]:
real_inputs = np.random.normal(size=(5,3,4,3,1)).astype(np.float32)
real_filters = np.random.normal(size=(2,3,3,2,1)).astype(np.float32)
result=conv_result_func(real_inputs, real_filters)
print np.array(result)

[[[[[  1.3052038 ]
    [  0.99738848]]

   [[  1.23546088]
    [ -3.78463674]]]


  [[[  0.95624417]
    [  3.17768621]]

   [[  0.66182953]
    [  4.60149765]]]]



 [[[[ -4.51677465]
    [ -6.0400176 ]]

   [[ 10.51604843]
    [  5.51716518]]]


  [[[ -1.98940551]
    [  5.68342638]]

   [[  2.21714044]
    [ -1.55560589]]]]



 [[[[ -4.65615368]
    [ -4.28107023]]

   [[  5.39695978]
    [  2.28894544]]]


  [[[  4.28638983]
    [ -4.13960791]]

   [[  1.95399821]
    [ -4.17457628]]]]



 [[[[ -5.53674126]
    [ -6.08783388]]

   [[  0.37000543]
    [-12.15050125]]]


  [[[  3.32547235]
    [ -6.08842087]]

   [[  7.22971964]
    [  2.42332625]]]]



 [[[[ -8.05591202]
    [ 11.65596104]]

   [[ -0.72672451]
    [  1.53100789]]]


  [[[ -5.61389637]
    [  0.24655201]]

   [[  7.72962809]
    [  0.32530639]]]]]


## Test for theano op convolution

In [42]:
inputs = T.ftensor4()
filters = T.ftensor4()

desc = cdnn.GpuDnnConvDesc('valid')(inputs.shape, filters.shape)

forward_conv = 1
desc_op = desc.owner.op
out_shp = cdnn.GpuDnnConv.get_out_shape(inputs.shape, filters.shape,
                                   desc_op.border_mode,
                                   desc_op.subsample)

out = gpu_alloc_empty(*out_shp)

conv_result = cdnn.GpuDnnConv()(inputs, filters, out, desc)

conv_result_func = theano.function([inputs, filters], conv_result, mode='DebugMode')

In [45]:
real_inputs = np.random.normal(size=(5,3,4,1)).astype(np.float32)
real_filters = np.random.normal(size=(2,3,3,1)).astype(np.float32)

In [46]:
result=conv_result_func(real_inputs, real_filters)
print np.array(result)

[[[[-1.81879711]
   [ 2.37883425]]

  [[ 1.00554168]
   [ 1.36972821]]]


 [[[-0.50348598]
   [-1.81900835]]

  [[-1.91241026]
   [ 0.72988558]]]


 [[[-4.87137651]
   [ 1.78511786]]

  [[-4.22570562]
   [ 3.57148552]]]


 [[[-0.41075268]
   [ 0.05635083]]

  [[ 0.44945204]
   [ 4.61057091]]]


 [[[ 1.76157069]
   [-4.53335381]]

  [[-0.19954003]
   [-5.59845734]]]]


### Old stuff 

In [2]:
from pylearn3dconv.volumetric_space import Conv3DSpace
from pylearn3dconv.layers.theano_3d_conv import Theano3dConv3dElemwise
import numpy as np
from pylearn2.models.mlp import IdentityConvNonlinearity
import theano
import theano.tensor as T
from pylearn3dconv.volumetric_dense_design_matrix import VolumetricDenseDesignMatrix
from pylearn2.training_algorithms.sgd import SGD
from pylearn2.models.mlp import MLP, Softmax
from pylearn2.format.target_format import OneHotFormatter
from numpy.random import RandomState
class FakeMLP():
    def __init__(self,rng,batch_size):
        self.rng = rng
        self.batch_size = batch_size

Using gpu device 0: GeForce GTX 780


In [71]:

fake_mlp = FakeMLP(rng=rng,batch_size=inputs_shape[0])
conv_3d_layer = Theano3dConv3dElemwise(output_channels=filters_shape[0], 
        kernel_shape=filters_shape[1:4],
        layer_name='conv3d_lin', nonlinearity=IdentityConvNonlinearity(),
        irange=0.001)
conv_3d_layer.set_mlp(fake_mlp)
conv_3d_input_space = Conv3DSpace(inputs_shape[1:4], num_channels=inputs_shape[4], axes=('b',0,1,2,'c'))
conv_3d_layer.set_input_space(conv_3d_input_space)


inputs_3d_layer_theano = ftensor5()
conv3d__layer_result = conv_3d_layer.fprop(inputs_3d_layer_theano)
conv3d_fprop = theano.function([inputs_3d_layer_theano], conv3d__layer_result)

inputs_3d_nnet_theano = ftensor5()
filters_3d_nnet_theano = ftensor5()
bias_3d_nnet_theano = T.fvector()
conv3d_nnet_result = theano.tensor.nnet.conv3D(inputs_3d_nnet_theano, filters_3d_nnet_theano, bias_3d_nnet_theano, d=(1,1,1))
conv3d_nnet = theano.function([inputs_3d_nnet_theano, filters_3d_nnet_theano, bias_3d_nnet_theano], conv3d_nnet_result)

inputs = (rng.rand(*inputs_shape).astype('float32') -1) * 2 # [-1,1)
filters = (rng.rand(*filters_shape).astype('float32') - 1) *2 # [-1,1)
bias = rng.rand(filters.shape[0]).astype('float32')
targets = np.zeros(inputs.shape[0]).astype('int')
targets[::2] = 1 # every second target is 1 or 0
inputs[targets == 1] = inputs[targets == 1] + 1


correct_result = conv3d_nnet(inputs, filters, bias)
conv_3d_layer.set_weights(filters)
conv_3d_layer.set_biases(bias)
layer_result = conv3d_fprop(inputs)

assert np.sum(np.square(correct_result - layer_result)) < 1e-4

target_formatter = OneHotFormatter(2)
targets_one_hot = target_formatter.format(targets)

conv_3d_layer.mlp = None
softmax_layer = Softmax(max_col_norm=2, layer_name='y', n_classes=2, istdev=.05)
mlp = MLP(input_space=conv_3d_input_space, layers=[conv_3d_layer, softmax_layer])
train_set = VolumetricDenseDesignMatrix(topo_view=inputs[0:50], y=targets_one_hot[0:50], axes=('b', 0, 1, 2, 'c'))
valid_set = VolumetricDenseDesignMatrix(topo_view=inputs[50:75], y=targets_one_hot[50:75], axes=('b', 0, 1, 2, 'c'))
test_set = VolumetricDenseDesignMatrix(topo_view=inputs[75:100], y=targets_one_hot[75:100], axes=('b', 0, 1, 2, 'c'))
algorithm = SGD(batch_size=20, learning_rate=0.1)
algorithm.setup(mlp, train_set)

NameError: name 'rng' is not defined

In [7]:
mlp_valid_result = mlp_fprop(valid_set.get_topological_view())
mlp_valid_result_labels = np.argmax(mlp_valid_result, axis=1)
mlp_test_result = mlp_fprop(test_set.get_topological_view())
mlp_test_result_labels = np.argmax(mlp_test_result, axis=1)
print("valid", np.sum(np.equal(np.argmax(valid_set.y, axis=1), mlp_valid_result_labels)) / float(len(valid_set.y)))
print("test", np.sum(np.equal(np.argmax(test_set.y, axis=1), mlp_test_result_labels)) / float(len(test_set.y)))

('valid', 0.88)
('test', 0.95999999999999996)


In [5]:
inputs_mlp_theano = ftensor5()
mlp_fprop_result = mlp.fprop(inputs_mlp_theano)

mlp_fprop = theano.function([inputs_mlp_theano], mlp_fprop_result)
mlp_result = mlp_fprop(train_set.get_topological_view())
mlp_result_labels = np.argmax(mlp_result, axis=1)
y_labels = np.argmax(train_set.y, axis=1)
np.sum(np.equal(y_labels, mlp_result_labels)) / float(len(train_set.y))

0.29999999999999999

In [6]:
algorithm.train(train_set)