diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 9b24591..830a7e6 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -127,12 +127,13 @@ function SpatialConvolution:createIODescriptors(input)
         self.pad = {self.padH, self.padW}
         self.stride = {self.dH, self.dW}

-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad,
+        self.convDescData = { padA = self.pad,
             filterStrideA = self.stride,
             upscaleA = {1,1},
             dataType = cudnn.configmap(torch.type(self.weight))
-        })
+        }
+
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         -- get output shape, resize output
         local oSize = torch.IntTensor(4)
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index c41e7e2..0ba5cd5 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -47,11 +47,11 @@ function SpatialFullConvolution:createIODescriptors(input)
         self.pad = {self.padH, self.padW}
         self.stride = {self.dH, self.dW}

-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad,
-             filterStrideA = self.stride,
-             dataType = cudnn.configmap(torch.type(self.weight))
-           })
+        self.convDescData = { padA = self.pad,
+            filterStrideA = self.stride,
+            dataType = cudnn.configmap(torch.type(self.weight))
+        }
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         -- get output shape, resize output
         local iwidth = input:size(4)
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index 87f7775..22400ce 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -37,7 +37,7 @@ function TemporalConvolution:createIODescriptors(input)
 end

 function TemporalConvolution:fastest(mode)
-   self = cudnn.SpatialConvolution.fastest(self,mode)
+   cudnn.SpatialConvolution.fastest(self,mode)
    return self
 end

diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index 64d0925..9a337cc 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -43,10 +43,9 @@ function VolumetricConvolution:createIODescriptors(input)
         if mathtype == 'CUDNN_DATA_HALF' then
            mathtype = 'CUDNN_DATA_FLOAT'
         end
-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad, filterStrideA = self.stride,
-             dataType = mathtype
-           })
+        self.convDescData = { padA = self.pad, filterStrideA = self.stride,
+                              dataType = mathtype }
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         local oSize = torch.IntTensor(5)
         errcheck('cudnnGetConvolutionNdForwardOutputDim',
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 8f8bac6..d62b37e 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -45,10 +45,9 @@ function VolumetricFullConvolution:createIODescriptors(input)
         -- create conv descriptor
         self.pad = {self.padT, self.padH, self.padW}
         self.stride = {self.dT, self.dH, self.dW}
-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad, filterStrideA = self.stride,
-             dataType = cudnn.configmap(torch.type(self.weight))
-           })
+        self.convDescData = { padA = self.pad, filterStrideA = self.stride,
+                              dataType = cudnn.configmap(torch.type(self.weight))}
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         -- get output shape, resize output
         local iwidth = input:size(5)
diff --git a/ffi.lua b/ffi.lua
index d5b5f8c..458e382 100644
--- a/ffi.lua
+++ b/ffi.lua
@@ -1614,10 +1614,10 @@ end

 -- check cuDNN version
 cudnn.version = tonumber(cudnn.C.cudnnGetVersion())
-if cudnn.version < 5005 then
-   error('These bindings are for version 5005 or above, '
+if cudnn.version < 5005 or cudnn.version >= 6000 then
+   error('These bindings are for CUDNN 5.x (5005 <= cudnn.version < 6000), '
          .. 'while the loaded CuDNN is version: ' .. cudnn.version
-         .. ' \nAre you using an older version of CuDNN?')
+         .. ' \nAre you using an older or newer version of CuDNN?')
 end

 -- check GPU driver version
diff --git a/find.lua b/find.lua
index ceca39e..65f7f65 100644
--- a/find.lua
+++ b/find.lua
@@ -2,7 +2,12 @@ local ffi = require 'ffi'

 find = {}
 find.__index = find
---find.verbose=true
+
+-- default is to get verbose on errors
+find.verbose=false
+find.verboseError=true
+find.verboseFallback=true
+
 -- constants to index array tables below
 local Fwd, BwdFilter, BwdData = 1, 2, 3

@@ -64,54 +69,19 @@ local bwdDataAlgoNames = {

 local algoNames = {fwdAlgoNames, bwdFilterAlgoNames, bwdDataAlgoNames}

--- this function is here and not in init.lua (and has the suffix) as generic
--- getConvolutionDescriptor methood should have native lua tables instead of FFI
--- (like setConvolutionDescriptor does, to be used with it)
--- However this is counterproductive for the purposes it's used in this module
-local function getConvolutionDescriptor_ffi(desc)
-   local CUDNN_DIM_MAX=8
-   local data = {
-      dim_p = ffi.new('int[1]'),
-      padA = ffi.new('int[?]', CUDNN_DIM_MAX),
-      filterStrideA = ffi.new('int[?]', CUDNN_DIM_MAX),
-      upscaleA = ffi.new('int[?]', CUDNN_DIM_MAX),
-      mode_p = ffi.new('cudnnConvolutionMode_t[1]'),
-      math_p = ffi.new('cudnnDataType_t[1]')
-   }
-
-   local status = cudnn.call('cudnnGetConvolutionNdDescriptor', desc[0], CUDNN_DIM_MAX,
-                             data.dim_p, data.padA, data.filterStrideA,
-                             data.upscaleA, data.mode_p, data.math_p)
-   if (status ~= ffi.C.CUDNN_STATUS_SUCCESS) then
-      if find.verbose or find.verboseError then
-         print("cudnnGetConvolutionNdDescriptor failed: ", tonumber(status))
-         return nil
-      end
-   end
-
-   data.arrayLength = data.dim_p[0]
-   data.mode = data.mode_p[0]
-   data.dataType = data.math_p[0]
-   return data
+local function convDataString(layer)
+   local info = ''
+   if layer.convDescData then
+      local desc = layer.convDescData
+      info = ' convDesc=[mode : ' .. desc.mode .. ' datatype : ' .. desc.dataType .. ']'
+   end
+   return info .. ' hash=' .. layer.autotunerHash
 end

 local function verboseCall(layer, f, ...)
-   if find.verbose then
-      print("find:verboseCall: calling " .. f .. ", hash: ", layer.autotunerHash)
-   end
    local status = cudnn.call(f, ...)
    if (status ~= ffi.C.CUDNN_STATUS_SUCCESS) and (find.verbose or find.verboseError) then
-      local prefix = "find:verboseCall:"
-      print( prefix .. f .. " failed: ", tonumber(status))
-      if layer.convDesc then
-         local desc = getConvolutionDescriptor_ffi(layer.convDesc)
-         if desc then
-            print (prefix .. ' conv desc mode : ', desc.mode, ' datatype : ', desc.datatype)
-         end
-      end
-   end
-   if find.verbose then
-      print("find:verboseCall: success, " .. f )
+      print("\n" .. f .. " failed: ", tonumber(status), convDataString(layer))
    end
    return status
 end

@@ -123,36 +93,39 @@ local function checkedCall(layer, f, ...)
       local str = ffi.string(cudnn.C.cudnnGetErrorString(status))
       error('Error in CuDNN: ' .. str .. ' ('..f..')')
    end
+   return status
 end
 find.checkedCall = checkedCall

 local function noFallback(layer)
-   if find.verbose then
-      print("find.defaultFallback: verboseCall failed for: ", layer.autotunerHash)
+   if find.verbose or find.verboseFallback then
+      print("\nfind.defaultFallback: verboseCall failed for: ", convDataString(layer))
    end
    return false
 end

+local function fallbackWarning(layer, msg)
+   if find.verbose or find.verboseFallback then
+      print("\n *** find.verboseFallback: " .. msg ..
+            "\n *** Falling back to 32-bit math for: " .. convDataString(layer))
+      print(" *** [ Set cudnn.find.verboseFallback to false to disable this message ] *** ")
+      print(" *** [ Alternatively, you may force CUDNN to always operate on CudaHalfTensors via 32-bit float conversion, in Lua: ] ***\n"
+            .." *** [ cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT'}) ] ***")
+      print(" *** [ Note: result may be faster or slower than native FP16, depending on your GPU and CUDNN operations ] *** ")
+   end
+end
+
 local function defaultFallback(layer, replay)
    -- read conv descriptor
-   local convDescData = getConvolutionDescriptor_ffi(layer.convDesc)
-
-   if convDescData and convDescData.dataType == ffi.C.CUDNN_DATA_HALF then
-      if find.verbose then
-         if replay then
-            print("find.defaultFallback: replay for ", layer.autotunerHash)
-         else
-            print("find.defaultFallback: no 16-bit float algo found, will try 32 bits for ", layer.autotunerHash)
-         end
-      end
-      -- using direct FFI call, not cudnn.setConvolutionDescriptor, for efficiency and clarity
-      checkedCall(layer, 'cudnnSetConvolutionNdDescriptor', layer.convDesc[0],
-                  convDescData.arrayLength,
-                  convDescData.padA,
-                  convDescData.filterStrideA,
-                  convDescData.upscaleA,
-                  convDescData.mode,
-                  ffi.C.CUDNN_DATA_FLOAT)
+   local convDescData = layer.convDescData
+   if convDescData and convDescData.dataType == "CUDNN_DATA_HALF" then
+      fallbackWarning(layer, replay
+                         and "16->32 bit fallback replay "
+                         or "No native FP16 algo found, will try 32-bit math")
+      -- update our record with fallback value
+      convDescData.dataType = "CUDNN_DATA_FLOAT"
+      -- update the descriptor in CUDNN
+      cudnn.setConvolutionDescriptor(convDescData, layer.convDesc)
       return true
    else
       return false
@@ -358,6 +331,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
   local function callCudnn(layer)
      local ret = 0
      validResults = 0
+     if not layer.convDesc or not layer.convDesc[0] then
+        error("No convDesc set on layer!")
+     end
+
      if self.algoFamily == FindExFamily then
         -- query temp workspace size
        local tempWorkspace, tempWorkspaceSize = cudnn.getSharedWorkspace()
@@ -375,6 +352,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
      else
         -- GetFamily: emulate findXXX results layout
         numPerfResults[0]=1
+        perfResults[0].algo = 0
+        perfResults[0].memory = 0
+        perfResults[0].status = 1
+
        local algWorkspaceLimit = layer.workspace_limit
           or (layer.nInputPlane * layer.kH * layer.kW * layer.weight.elementSize())

@@ -382,6 +363,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
                       cudnn.getHandle(),
                       params[1], params[3], layer.convDesc[0], params[6],
                       algSearchMode, algWorkspaceLimit, algType[findAPI_idx])
+        if ret ~= 0 then
+           return ret
+        end
+
        local retAlgo = algType[findAPI_idx][0]
        if find.verbose then
           print(string.format(
@@ -395,6 +380,9 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
                       cudnn.getHandle(),
                       params[1], params[3], layer.convDesc[0], params[6],
                       retAlgo, bufSize:data())
+        if ret ~= 0 then
+           return ret
+        end
        if find.verbose then
           print(string.format(
              "\n" .. getWSAlgos[findAPI_idx] .. ": bufSize: %d, current ws: %d",
@@ -427,31 +415,75 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
               local fallback = ''
               if (useFallback) then fallback = "[FALLBACK]" end
               print(string.format(
-                       "\n" .. API .. " algo: %s (%d, status: %d), memory: %8d, count: %d"
-                          .. " hash: %45s " .. cacheHit .. fallback,
+                       "\n" .. API .. " algo[%d]: %s (%d, status: %d), time: %.04f, memory: %8d, count: %d"
+                          .. " %s " .. cacheHit .. fallback,
+                       validResults,
                        algoNames[findAPI_idx][cachedAlgo[validResults].algo+1],
                        cachedAlgo[validResults].algo, cachedAlgo[validResults].status,
-                       cachedAlgo[validResults].memory, r, layer.autotunerHash))
+                       cachedAlgo[validResults].time, cachedAlgo[validResults].memory, r, convDataString(layer)))
            end
         end
      end
-     if validResults < 1 and find.verbose then
-        print("Could not find any valid convolution algorithms for sizes: " .. layer.autotunerHash)
-        -- todo: add case of multi-stream not fitting in size
+     if validResults < 1 then
        return 1
     end
     return 0
  end
+
+  local function performanceFallback(layer)
+     -- read conv descriptor
+     local convDescData = layer.convDescData
+
+     if convDescData and convDescData.dataType == "CUDNN_DATA_HALF" then
+        local savedResults = cachedAlgo
+        local savedNum = validResults
+        cachedAlgo = {}
+        validResults = 0
+        useFallback = true
+
+        -- update our record with fallback value
+        layer.convDescData.dataType = "CUDNN_DATA_FLOAT"
+        -- update the descriptor in CUDNN
+        cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
+        -- do the actual call
+        local status = callCudnn(layer)
+        -- check if we got better results with float32
+        if status == 0 and validResults > 0 and cachedAlgo[1].time < savedResults[1].time then
+           if find.verbose or find.verboseFallback then
+              local msg = string.format("find.performanceFallback: found 32-bit float op is faster (%f) than FP16(%f), memory increase: %fM",
+                                        cachedAlgo[1].time, savedResults[1].time,
+                                        (tonumber(cachedAlgo[1].memory)-tonumber(savedResults[1].memory))/Meg)
+              fallbackWarning(layer, msg)
+           end
+           return
+        end
+        -- restore if we didn't
+        cachedAlgo = savedResults
+        validResults = savedNum
+        -- update our record with fallback value
+        layer.convDescData.dataType = "CUDNN_DATA_HALF"
+        -- update the descriptor in CUDNN
+        cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
+
+     end
+  end
+
  -- do the actual call
  local status = callCudnn(layer)
  if status ~= 0 or validResults < 1 then
     if self.fallback and self.fallback(layer) then
-       useFallback = true;
+       useFallback = true
        status = callCudnn(layer)
-       if status ~= 0 or validResults < 1 then
-          error ("Fallback attempt failed for " .. API .. ', sizes: ' .. layer.autotunerHash)
-       end
+    end
+    -- check again
+    if status ~= 0 or validResults < 1 then
+       error (API .. ' failed, sizes: ' .. convDataString(layer))
+    end
+ else
+    -- if we are running Find or FindEx in native fp16, check if this algo is actually faster in pseudo-fp16
+    if self.algoFamily ~= GetFamily then
+       performanceFallback(layer)
     end
  end
  self:store(layer, findAPI_idx, cachedAlgo)
@@ -475,9 +507,9 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
      local fallback = ""
      if (useFallback) then fallback = "[FALLBACK]" end
      print(string.format(
-              "\n" .. API .. ": %s(%d)[%d of %d] Workspace: %8fM (current ws size %fM, max: %dM free: %dM) hash: %45s" .. cacheHit .. fallback,
+              "\n" .. API .. ": %s(%d)[%d of %d] Workspace: %8fM (current ws size %fM, max: %dM free: %dM) %s" .. cacheHit .. fallback,
               algoNames[findAPI_idx][cachedAlgo[retAlgo].algo+1], cachedAlgo[retAlgo].algo, retAlgo, #cachedAlgo,
-              tonumber(cachedAlgo[retAlgo].memory)/Meg, curWorkspaceSize/Meg, self.maxWorkspaceSize/Meg, freeMemory/Meg, layer.autotunerHash))
+              tonumber(cachedAlgo[retAlgo].memory)/Meg, curWorkspaceSize/Meg, self.maxWorkspaceSize/Meg, freeMemory/Meg, convDataString(layer)))
    end
    return cachedAlgo[retAlgo].algo
 end
@@ -513,9 +545,9 @@ end

 function find:forwardAlgorithm(layer, params)
-    if layer.fmode then
-       setupWS(layer, params, layer.fmode, Fwd)
-       return layer.fmode
+   if layer.fmode then
+      setupWS(layer, params, layer.fmode, Fwd)
+      return layer.fmode
    end
    local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
    if layer.fastest_mode or cudnn.fastest == true then
@@ -526,9 +558,9 @@ end

 function find:backwardFilterAlgorithm(layer, params)
    -- Check if we are in "sticky" mode
-    if layer.bwmode then
-       setupWS(layer, params, layer.bwmode, BwdFilter)
-       return layer.bwmode
+   if layer.bwmode then
+      setupWS(layer, params, layer.bwmode, BwdFilter)
+      return layer.bwmode
    end
    local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
    if layer.fastest_mode or cudnn.fastest == true then
@@ -540,9 +572,9 @@ end

 function find:backwardDataAlgorithm(layer, params)
    -- Check if we are in "sticky" mode
-    if layer.bdmode then
-       setupWS(layer, params, layer.bdmode, BwdData)
-       return layer.bdmode
+   if layer.bdmode then
+      setupWS(layer, params, layer.bdmode, BwdData)
+      return layer.bdmode
    end
    local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
    if layer.fastest_mode or cudnn.fastest == true then
diff --git a/functional.lua b/functional.lua
index e877cec..5385ffb 100644
--- a/functional.lua
+++ b/functional.lua
@@ -73,11 +73,10 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
                                                filterDimA = {nOutputPlane, nInputPlane, kH, kW}})

     -- create a convolution descriptor
-    local convDesc = cudnn.setConvolutionDescriptor(
-       { padA = {padH, padW},
+    local convDescData = { padA = {padH, padW},
          filterStrideA = {strideH, strideW},
          dataType = getMathType(weight) }
-    );
+    local convDesc = cudnn.setConvolutionDescriptor(convDescData);

     -- create input descriptor
     local iDesc = cudnn.toDescriptor(input)
@@ -97,6 +96,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
     local oDesc = cudnn.toDescriptor(output)

     local layer = {
+       convDescData = convDescData,
        convDesc = convDesc,
        weight = weight,
        nInputPlane = nInputPlane,
@@ -141,17 +141,17 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight,
                                                filterDimA = {nOutputPlane, nInputPlane, kH, kW} })
     -- create a convolution descriptor
-    local convDesc = cudnn.setConvolutionDescriptor(
-       { padA = {padH, padW},
-         filterStrideA = {strideH, strideW},
-         dataType = getMathType(weight)
-       }
-    );
+    local convDescData = { padA = {padH, padW},
+         filterStrideA = {strideH, strideW},
+         dataType = getMathType(weight)
+    }
+    local convDesc = cudnn.setConvolutionDescriptor(convDescData);

     -- create input, output descriptor
     local iDesc = cudnn.toDescriptor(input)
     local oDesc = cudnn.toDescriptor(output)

     local layer = {
+       convDescData = convDescData,
        convDesc = convDesc,
        weight = weight,
        nInputPlane = nInputPlane,
@@ -193,11 +193,10 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW
     local weightDesc = cudnn.setFilterDescriptor({ dataType = cudnn.typemap[torch.type(input)],
                                                filterDimA = {nOutputPlane, nInputPlane, kH, kW}})

     -- create a convolution descriptor
-    local convDesc = cudnn.setConvolutionDescriptor(
-       { padA = {padH, padW},
-         filterStrideA = {strideH, strideW},
-         dataType = getMathType(gradWeight) }
-    );
+    local convDescData = { padA = {padH, padW},
+         filterStrideA = {strideH, strideW},
+         dataType = getMathType(gradWeight) }
+    local convDesc = cudnn.setConvolutionDescriptor(convDescData);

     -- create input, output descriptor
     local iDesc = cudnn.toDescriptor(input)
@@ -205,6 +204,7 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW

     local layer = {
        convDesc = convDesc,
+       convDescData = convDescData,
        weight = gradWeight,
        nInputPlane = nInputPlane,
        nOutputPlane = nOutputPlane,
diff --git a/init.lua b/init.lua
index 6c8abd7..b4ba8eb 100644
--- a/init.lua
+++ b/init.lua
@@ -16,9 +16,6 @@ cudnn.fastest = false
 -- Warning: this option is experimental and assumes at least 2 warmup iterations!
 cudnn.useFindEx = false

--- if true, use 'pseudo-fp16' (half storage, float math) even if true fp16 math is available
-cudnn.useFloatMathForHalf = false
-
 -- amount of memory to use on 1st iteration for FindEx
 cudnn.initialWorkspaceBytes = 1024

@@ -209,17 +206,19 @@ end

 function cudnn.setConvolutionDescriptor(data, desc)
-   local dim = data.arrayLength or #data.padA
-   local upscale = data.upscaleA or torch.IntStorage(dim):fill(1)
+   if not data.arrayLength then data.arrayLength = #data.padA end
+   if not data.upscaleA then data.upscaleA = torch.IntStorage(data.arrayLength):fill(1) end
+   if not data.mode then data.mode = 'CUDNN_CROSS_CORRELATION' end
+
    local myDesc = desc or cudnn.createDescriptors(
       1, 'struct cudnnConvolutionStruct*[?]',
       'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
    errcheck('cudnnSetConvolutionNdDescriptor', myDesc[0],
-            dim,
+            data.arrayLength,
             torch.IntTensor(data.padA):data(),
             torch.IntTensor(data.filterStrideA):data(),
-            torch.IntTensor(upscale):data(),
-            data.mode or 'CUDNN_CROSS_CORRELATION',
+            torch.IntTensor(data.upscaleA):data(),
+            data.mode,
             data.dataType)
    return myDesc
 end

diff --git a/test/test.lua b/test/test.lua
index 2b69fa2..46723fc 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -11,7 +11,7 @@ local jac = nn.Jacobian
 local testparams_half = {
    test_type = 'torch.CudaHalfTensor',
    precision_forward = 2e-1,
-   precision_backward = 8,
+   precision_backward = 10,
    precision_jac = 1e-3,
    precision_io = 1e-1,
 }
@@ -131,7 +131,7 @@ function cudnntest.SpatialConvolution()
    local input = torch.randn(bs,from,inj,ini):cuda()
    local gradOutput = torch.randn(bs,to,outj,outi):cuda()
    local sconv = nn.SpatialConvolution(from,to,ki,kj,si,sj):cuda()
-   local gconv = cast(cudnn.SpatialConvolution(from,to,ki,kj,si,sj)):fastest()
+   local gconv = cast(cudnn.SpatialConvolution(from,to,ki,kj,si,sj))
    gconv.weight:copy(sconv.weight)
    gconv.bias:copy(sconv.bias)

@@ -162,7 +162,7 @@ function cudnntest.SpatialFullConvolution()
    local input = torch.randn(bs,from,inj,ini):cuda()
    local gradOutput = torch.randn(bs,to,outj,outi):cuda()
    local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda()
-   local gconv = cast(cudnn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda():fastest())
+   local gconv = cast(cudnn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda())
    gconv.weight:copy(sconv.weight)
    gconv.bias:copy(sconv.bias)

@@ -189,7 +189,7 @@ function cudnntest.TemporalConvolution()
    local input = torch.randn(bs,ini,inputFrameSize):cuda()
    local gradOutput = torch.randn(bs,outi,outputFrameSize):cuda()
    local sconv = nn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda()
-   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda():fastest())
+   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda())
    gconv.weight:copy(sconv.weight:view(gconv.weight:size()))
    gconv.bias:copy(sconv.bias)

@@ -225,7 +225,7 @@ function cudnntest.TemporalConvolution_padding_batch()
    local groundweight = sconv.gradWeight
    local groundbias = sconv.gradBias

-   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si,pad_h):cuda():fastest())
+   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si,pad_h):cuda())
    gconv.weight:copy(sconv.weight:view(gconv.weight:size()))
    gconv.bias:copy(sconv.bias)
    gconv:forward(cast(input))
@@ -330,10 +330,14 @@ function cudnntest.VolumetricFullConvolution()
    local outk = (ink-1)*sk+kk
    local scale = math.random()

+   if testparams.test_type == 'torch.CudaDoubleTensor' then
+      return
+   end
+
    local input = torch.randn(bs,from,ink,inj,ini):cuda()
    local gradOutput = torch.randn(bs,to,outk,outj,outi):cuda()
    local sconv = nn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda()
-   local gconv = cast(cudnn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda():fastest())
+   local gconv = cast(cudnn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda())
    gconv.weight:copy(sconv.weight)
    gconv.bias:copy(sconv.bias)
@@ -936,18 +940,21 @@ mytester:add(cudnntest)

 cudnn.verbose=false
 cudnn.find.verbose=false
+-- this is the default, keep it for demo of 16->32 bit float fallback
+cudnn.find.verboseFallback=true
 cudnn.useFindEx=false

-for i = 1, cutorch.getDeviceCount() do
-   cudnn.configureMath()
+for i = 1, 1 do -- cutorch.getDeviceCount() do

-   for _, benchmark in ipairs({true, false}) do
+   for _, benchmark, fast in ipairs({true, false}) do
      cudnn.benchmark = benchmark
---      cudnn.reset()
+      -- use random fastest() test for non-benchmark case
+      if not benchmark then cudnn.fastest = (math.random(0,1) == 1) end
+
      local prop = cutorch.getDeviceProperties(i)

      print('Running test on device: #' .. i .. ' : ' .. prop.name
-              .. ' with benchmark = ' .. tostring(cudnn.benchmark))
+              .. ' with benchmark = ' .. tostring(cudnn.benchmark) .. ' and fastest = ' .. tostring(cudnn.fastest))

      cutorch.setDevice(i)
@@ -958,14 +965,6 @@ for i = 1, cutorch.getDeviceCount() do

      print( 'Testing torch.CudaHalfTensor, torch.cudnn fp16 math is : ',
             cudnn.configmap('torch.CudaHalfTensor' ),
             ', cutorch.hasFastHalfInstructions() is ', cutorch.hasFastHalfInstructions())
-     if cudnn.configmap('torch.CudaHalfTensor') ~= 'CUDNN_DATA_FLOAT' then
-        print([[ Warning: 32-bit float math is forced for CudaHalfTensor test
-        even though native fast 16-bit float math is available for this device.
-        The reason is cudnn convolution algo find methods for fp16 and certain size combinations may fail.
-        This should be fixed in next release.]])
-        cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT'})
-     end
-
      testparams = testparams_half
      mytester:run()
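Note on the overall pattern (not part of the patch itself): the common thread of the Lua-side changes is that every convolution module now keeps the arguments it passed to cudnn.setConvolutionDescriptor in a plain Lua table (layer.convDescData). find.lua can then perform the 16->32 bit fallback by rewriting that table and re-applying it to the existing descriptor through the two-argument form cudnn.setConvolutionDescriptor(data, desc), instead of reading the descriptor back through FFI with cudnnGetConvolutionNdDescriptor. A minimal illustrative sketch of that pattern follows; the layer table, the pad/stride literals and the data type value are made up for the example and are not taken from the patch:

-- illustrative only: mirrors the convDescData pattern introduced by this patch
local layer = {}
layer.convDescData = {
   padA          = {1, 1},             -- {padH, padW}
   filterStrideA = {1, 1},             -- {dH, dW}
   upscaleA      = {1, 1},
   dataType      = 'CUDNN_DATA_HALF',  -- e.g. cudnn.configmap('torch.CudaHalfTensor')
}
-- create the cuDNN descriptor from the table
-- (arrayLength, upscaleA and mode defaults are filled in by init.lua)
layer.convDesc = cudnn.setConvolutionDescriptor(layer.convDescData)

-- FP16 -> FP32 fallback: rewrite the Lua-side record, then re-apply it to the
-- same descriptor; no FFI read-back of the descriptor is needed
if layer.convDescData.dataType == 'CUDNN_DATA_HALF' then
   layer.convDescData.dataType = 'CUDNN_DATA_FLOAT'
   cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
end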