In [1]:
-- Make 'torch.FloatTensor' the type of tensors created without an explicit
-- type (e.g. via torch.Tensor(...)) in the cells that follow.
torch.setdefaulttensortype('torch.FloatTensor')




### Load `Integral` and test it right away

In [2]:
require 'image'

-- Every `Integral` implementation that `require` may currently have cached.
local integralModules = {
    'Integral-c', 'Integral-c-multi',
    'Integral-jit', 'Integral-jit-multi',
    'Integral-cuda-multi',
}

-- Drops the global `Integral`, its registry entry and all cached
-- implementations, then loads `moduleName` so that it defines `Integral` anew.
local function reloadIntegral(moduleName)
    Integral = nil
    debug.getregistry()['Integral'] = nil
    for _, cached in ipairs(integralModules) do
        package.loaded[cached] = nil
    end
    require(moduleName)
end

-- Sum of absolute differences between `gold` and `test`, divided by the
-- element count and by the mean magnitude of `gold`.
local function meanRelativeError(gold, test)
    return (gold - test):abs():sum() / test:nElement() / torch.abs(gold):mean()
end

-- Top-left 4x4 patch of the Lena image averaged over its first dimension
local lena = image.lena():mean(1):squeeze()[{{1,4}, {1,4}}]

-- compute reference forward and backward results with the LuaJIT implementation
reloadIntegral('Integral-jit-multi')

local intGold = Integral(16, 4, 4)
local params, gradParamsGold = intGold:getParameters()

-- Clear the gradient buffers before backward(), as the gradcheck sweep does,
-- so the gradParams comparison does not depend on their initial contents.
intGold:zeroGradParameters()
local forwardGold = intGold:forward(lena)
local gradInputGold = intGold:backward(lena, forwardGold)

-- swap in the implementation under test
reloadIntegral('Integral-cuda-multi')

local intTest = Integral(16, 4, 4)
local paramsTest, gradParamsTest = intTest:getParameters()

-- Give both modules identical parameters, then refresh the derived area terms
paramsTest:copy(params)
intTest:recalculateArea()
intTest:zeroGradParameters()

-- compare results
local forwardTest = intTest:forward(lena)
local forwardErr = meanRelativeError(forwardGold, forwardTest)
print('Output mean relative error:', forwardErr * 100 .. ' %')

-- Feed the reference output back as gradOutput, exactly as done for intGold
local gradInputTest = intTest:backward(lena, forwardGold)
local gradInputErr = meanRelativeError(gradInputGold, gradInputTest)
print('gradInput mean relative error:', gradInputErr * 100 .. ' %')
local gradParamsErr = meanRelativeError(gradParamsGold, gradParamsTest)
print('gradParams mean relative error:', gradParamsErr * 100 .. ' %')

-- Thresholds apply to the raw ratios, not to the percentages printed above
assert(forwardErr    < 1e-6, 'forward output differs from the reference')
assert(gradInputErr  < 1e-6, 'gradInput differs from the reference')
assert(gradParamsErr < 7e-4, 'gradParams differ from the reference')

Output mean relative error:	2.1069902184093e-06 %	
gradInput mean relative error:	1.8800832663184e-06 %	
gradParams mean relative error:	2.0944284567691e-05 %	


### Test multi-channel vs single-channel `Integral`

Test forward pass

In [3]:
local nInputCh = 5
local nWindows = 7
local side = 234 -- height and width of the random test input

require 'image'
torch.manualSeed(3)
local input = torch.rand(nInputCh, side, side)

-- Forgets the currently loaded `Integral` class and every cached implementation.
local function forgetIntegral()
    Integral = nil
    debug.getregistry()['Integral'] = nil
    for _, cached in ipairs{'Integral-c', 'Integral-c-multi', 'Integral-jit',
                            'Integral-jit-multi', 'Integral-cuda-multi'} do
        package.loaded[cached] = nil
    end
end

forgetIntegral()
require 'Integral-jit-multi'

-- Reference result: run the module on each input plane separately and stack
-- the per-plane outputs along the first dimension.
local intGold = Integral(nWindows, side, side)
intGold:getParameters() -- returned tensors are not needed in this cell

local forwardGold = torch.Tensor(nInputCh*nWindows, side, side)
for ch = 1,nInputCh do
    local planes = forwardGold:narrow(1, (ch-1)*nWindows + 1, nWindows)
    planes:copy(intGold:forward(input[ch]))
end

-- Replace the reference class with the implementation under test
forgetIntegral()
require 'Integral-cuda-multi'

-- Reuse the reference module's window coordinates so both compute the same thing
local intTest = Integral(nWindows, side, side)
intTest.xMin = intGold.xMin
intTest.xMax = intGold.xMax
intTest.yMin = intGold.yMin
intTest.yMax = intGold.yMax
intTest:recalculateArea()

-- One call on the whole multi-plane input
local forwardTest = intTest:forward(input)

local absDiffSum = (forwardGold - forwardTest):abs():sum()
local goldMagnitude = torch.abs(forwardGold):mean()
local forwardErr = absDiffSum / forwardTest:nElement() / goldMagnitude
print('Output mean relative error:', forwardErr * 100 .. ' %')

Output mean relative error:	6.3928183699611e-07 %	


### Gradcheck

In [4]:
-- Advance the global parameter selector through 1..5, wrapping back to 1.
-- An unset selector starts at 1.
if not nParam or nParam == 5 then
    nParam = 1
else
    nParam = nParam + 1
end
print(nParam)

1	


In [9]:
-- Gradient check: sweep one scalar (selected by the global `nParam`) over a
-- range of values and record, for every value, the loss and the gradient
-- reported by backprop. Forward differences of the recorded losses give the
-- numerical gradient to compare against.
--
-- Results are left in the globals `param`, `loss`, `dL_dParam_nn` and
-- `dL_dParam_diff` so that a later cell can plot them.

Integral = nil
debug.getregistry()['Integral'] = nil
for _, cached in ipairs{'Integral-c', 'Integral-c-multi', 'Integral-jit',
                        'Integral-jit-multi', 'Integral-cuda-multi'} do
    package.loaded[cached] = nil
end
require 'Integral-c-multi'
require 'image'
require 'nn'

local nInputCh = 2
local nWindows = 2
local imSize = 90

local stepSize = 1 -- spacing of the swept values when nParam == 5
local maxStep = 200 -- number of swept values

-- nParam 1..4: one window edge is pinned and the opposite edge is swept.
--   1: xMin swept (xMax pinned at  60)    3: xMax swept (xMin pinned at -60)
--   2: yMin swept (yMax pinned at  60)    4: yMax swept (yMin pinned at -60)
-- `offset` positions the sweep relative to the pinned value: minima run up to
-- the pinned maximum, maxima start at the pinned minimum.
-- nParam 5 has no entry: it sweeps the input element [1][1][1] instead.
local sweeps = {
    {pinned = 'xMax', pinnedValue =  60, swept = 'xMin', grad = 'gradXMin', offset = -maxStep},
    {pinned = 'yMax', pinnedValue =  60, swept = 'yMin', grad = 'gradYMin', offset = -maxStep},
    {pinned = 'xMin', pinnedValue = -60, swept = 'xMax', grad = 'gradXMax', offset = -1},
    {pinned = 'yMin', pinnedValue = -60, swept = 'yMax', grad = 'gradYMax', offset = -1},
}

-- Fail early instead of silently producing empty result tables
assert(nParam == 5 or sweeps[nParam] ~= nil,
       'nParam must be an integer in 1..5 (run the selector cell first)')
local sweep = sweeps[nParam] -- nil when nParam == 5

torch.manualSeed(666)
input = image.convolve(torch.rand(nInputCh, imSize, imSize):pow(3),
                                image.gaussian(math.max(1,imSize/3)), 'same')
-- NOTE(review): this smoothed target is overwritten by the class-index target
-- below and is only meaningful with the commented-out MSECriterion. It is kept
-- because its torch.rand call advances the RNG that the code below relies on.
target = image.convolve(torch.rand(nInputCh*nWindows, imSize, imSize):pow(3),
                                image.gaussian(math.max(1,imSize/3)), 'same')

-- `int` is the module whose parameters are checked; it is the first layer
int = Integral(nWindows, imSize, imSize)
net = nn.Sequential()
net:add(int)
net:add(nn.SpatialConvolutionMM(nInputCh*nWindows, 3, 1, 1, 1, 1))
net:add(nn.LeakyReLU(0.01))
net:add(Integral(4, imSize, imSize))
net:add(nn.SpatialConvolutionMM(3*4, 8, 1, 1, 1, 1))
net:add(nn.Reshape(8, imSize*imSize))
net:add(nn.Transpose({2, 1}))

-- One random class label in 1..8 per pixel
target = torch.IntTensor(imSize*imSize)
target:apply(function() return torch.random(8) end)

-- crit = nn.MSECriterion()
crit = nn.CrossEntropyCriterion()

param = {}
loss = {}
dL_dParam_nn = {}
dL_dParam_diff = {}

if sweep then
    int[sweep.pinned][1] = sweep.pinnedValue
end
int:recalculateArea()

-- Values the swept scalar will take
for i = 1,maxStep do
    if sweep then
        param[i] = int[sweep.pinned][1] + sweep.offset + i
    else
        param[i] = -10 + i*stepSize
    end
end

for i = 1,#param do
    net:zeroGradParameters()

    if sweep then
        int[sweep.swept][1] = param[i]
    else
        input[{1,1,1}] = param[i]
    end
    int:recalculateArea()

    -- pred / currLoss / dLoss_dOutput stay global, as in the original cell
    pred = net:forward(input)
    currLoss = crit:forward(pred, target)
    dLoss_dOutput = crit:backward(pred, target)
    net:backward(input, dLoss_dOutput)

    loss[i] = currLoss

    if sweep then
        dL_dParam_nn[i] = int[sweep.grad][1]
    else
        dL_dParam_nn[i] = int.gradInput[{1,1,1}]
    end

    collectgarbage()

    if i % 50 == 0 then print(i) end
end

-- Forward differences, divided by the actual spacing of the swept values
-- (the sweeps for nParam 1..4 always use unit spacing, whatever stepSize is)
for i = 1,#param-1 do
    dL_dParam_diff[i] = (loss[i+1] - loss[i]) / (param[i+1] - param[i])
end

-- The last sample has no forward difference; drop it so that all four result
-- tables have the same length.
table.remove(param)
table.remove(dL_dParam_nn)
table.remove(loss)

50	


100	


150	


200	


In [10]:
require 'gnuplot'

-- Convert the sweep results (Lua tables filled by the gradcheck cell) to tensors
local xs = torch.Tensor(param)
local backpropGrad = torch.Tensor(dL_dParam_nn)
local finiteDiffGrad = torch.Tensor(dL_dParam_diff)

-- One curve per gradient estimate, both drawn as lines over the same x axis
local curves = {
    {'manual', xs, backpropGrad, '-'},
    {'diff', xs, finiteDiffGrad, '-'},
    -- optional extras, disabled:
    -- {'zero', torch.Tensor{0, 0}, torch.Tensor{finiteDiffGrad:min(), finiteDiffGrad:max()}, '-'},
    -- {'loss', xs, torch.Tensor(loss), '-'},
}

gnuplot.figure()
gnuplot.plot(curves)
gnuplot.movelegend('right', 'middle')
gnuplot.xlabel('parameter')
gnuplot.ylabel('dLoss / dParameter')

### Measure execution times

Input: 1024x768 image

* **Forward** experiment: compute `16x1024x768` feature maps 4 times
* **Backward** experiment: do a backprop step (from the "after `Forward`" state) 4 times

#### Forward

* $2.993 \pm 0.242$ sec. (LuaJIT loop + exact fractional parts computation)	
* $0.487 \pm 0.087$ sec. (LuaJIT loop)
* $0.358 \pm 0.019$ sec. (C loop)
* $0.321 \pm 0.037$ sec. (+ precomputed `t,b,l,r`)
* $0.165 \pm 0.058$ sec. (+ parallel)
* $0.140 \pm 0.066$ sec. (+ inline `areaCoeff` multiplication)


* $0.03152 \pm 0.0084$ sec. (CUDA, 'single' kernel)
* $0.03481 \pm 0.0081$ sec. (CUDA, 'multi' kernel, block size = 1x32x32)
* $0.08079 \pm 0.0123$ sec. (CUDA, 'multi' kernel, block size = 4x16x16)

#### Backward

* $2.660 \pm 0.252$ sec. (LuaJIT)
* $1.956 \pm 0.385$ sec. (C parallel `updGI`)
* $1.616 \pm 0.274$ sec. (+ C gradParam loop, precomputed `t,b,l,r`)
* $1.357 \pm 0.136$ sec. (+ parallelize by deltas -- 2 threads)
* $1.991 \pm 0.358$ sec. (+ parallelize by deltas -- 4 threads)
* $1.576 \pm 0.290$ sec. (+ parallelize by rows)

In [4]:
-- Forget any previously loaded `Integral` class and cached implementation,
-- then load the CUDA multi-channel one for the benchmarks.
Integral = nil
debug.getregistry()['Integral'] = nil
for _, cached in ipairs{'Integral-c', 'Integral-c-multi', 'Integral-jit',
                        'Integral-jit-multi', 'Integral-cuda-multi'} do
    package.loaded[cached] = nil
end
require 'Integral-cuda-multi'

In [5]:
-- Benchmark configuration (globals shared by the timing cells):
-- image height, image width, number of feature maps.
h, w, nMaps = 768, 1024, 16

In [6]:
-- Module under benchmark, moved to the GPU. The first argument comes from
-- `nMaps` (set together with `h` and `w`) instead of repeating the literal 16.
int = Integral(nMaps, h, w):cuda()
-- Flattened views of the module's parameters and their gradients
params, gradParams = int:getParameters()

#### Forward experiment

In [9]:
-- Forward benchmark: time `repeats` consecutive forward passes, `timeRepeats`
-- times over, and report the mean and 2.1 standard deviations.
img = torch.rand(h, w):cuda()
local repeats = 4
local timeRepeats = 20

-- Warm-up pass. Wait for the device to finish it before any timer starts, so
-- that leftover warm-up work cannot leak into the first measurement.
int:forward(img)
cutorch.synchronize()

local times = torch.Tensor(timeRepeats)

for timeRepeat = 1,timeRepeats do

    local timer = torch.Timer()

    for _ = 1,repeats do
        int:forward(img)
    end

    -- Wait for all queued GPU work before reading the clock
    cutorch.synchronize()
    timer:stop()

    times[timeRepeat] = timer:time().real

    -- Collect garbage outside the timed region
    collectgarbage()
end

print(times:mean() .. ' +/- ' .. 2.1 * times:std() .. ' seconds')

0.080793416500092 +/- 0.012302900309995 seconds	


#### Backward experiment

In [22]:
-- Backward benchmark: time `repeats` consecutive backward passes (starting
-- from the state left by a forward pass), `timeRepeats` times over, and report
-- the mean and 2.1 standard deviations.

-- Waits for outstanding GPU work when cutorch is loaded; no-op otherwise, so
-- the cell also works with a CPU implementation of `Integral`.
local function synchronize()
    if cutorch then cutorch.synchronize() end
end

-- Create the input with the same tensor type as the module's parameters
-- (`params` is the flattened parameter tensor obtained together with `int`),
-- so a module moved to the GPU is not fed a CPU tensor.
img = torch.rand(1, h, w):type(params:type())
local repeats = 4
local timeRepeats = 15

times = torch.Tensor(timeRepeats)

-- Warm-up: one forward pass to populate int.output, then one backward pass
int:forward(img)
int:backward(img, int.output)
synchronize()

for timeRepeat = 1,timeRepeats do

    local timer = torch.Timer()

    for _ = 1,repeats do
        -- The module's own output doubles as gradOutput
        int:backward(img, int.output)
    end

    -- Make sure all queued device work is included in the measurement
    synchronize()
    timer:stop()

    times[timeRepeat] = timer:time().real

    -- Collect garbage outside the timed region
    collectgarbage()

end

print(times:mean() .. ' +/- ' .. 2.1 * times:std() .. ' seconds')

3.0365828037262 +/- 0.74377151392453 seconds	
