diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 9b24591..830a7e6 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -127,12 +127,13 @@ function SpatialConvolution:createIODescriptors(input)
         self.pad = {self.padH, self.padW}
         self.stride = {self.dH, self.dW}

-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad,
+        self.convDescData = { padA = self.pad,
             filterStrideA = self.stride,
             upscaleA = {1,1},
             dataType = cudnn.configmap(torch.type(self.weight))
-        })
+        }
+
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         -- get output shape, resize output
         local oSize = torch.IntTensor(4)
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index c41e7e2..0ba5cd5 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -47,11 +47,11 @@ function SpatialFullConvolution:createIODescriptors(input)
         self.pad = {self.padH, self.padW}
         self.stride = {self.dH, self.dW}

-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad,
-             filterStrideA = self.stride,
-             dataType = cudnn.configmap(torch.type(self.weight))
-           })
+        self.convDescData = { padA = self.pad,
+            filterStrideA = self.stride,
+            dataType = cudnn.configmap(torch.type(self.weight))
+        }
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         -- get output shape, resize output
         local iwidth = input:size(4)
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index 87f7775..22400ce 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -37,7 +37,7 @@ function TemporalConvolution:createIODescriptors(input)
 end

 function TemporalConvolution:fastest(mode)
-   self = cudnn.SpatialConvolution.fastest(self,mode)
+   cudnn.SpatialConvolution.fastest(self,mode)
    return self
 end

diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index 64d0925..9a337cc 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -43,10 +43,9 @@ function VolumetricConvolution:createIODescriptors(input)
         if mathtype == 'CUDNN_DATA_HALF' then
            mathtype = 'CUDNN_DATA_FLOAT'
         end
-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad, filterStrideA = self.stride,
-             dataType = mathtype
-           })
+        self.convDescData = { padA = self.pad, filterStrideA = self.stride,
+                              dataType = mathtype }
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         local oSize = torch.IntTensor(5)
         errcheck('cudnnGetConvolutionNdForwardOutputDim',
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 8f8bac6..d62b37e 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -45,10 +45,9 @@ function VolumetricFullConvolution:createIODescriptors(input)
         -- create conv descriptor
         self.pad = {self.padT, self.padH, self.padW}
         self.stride = {self.dT, self.dH, self.dW}
-        self.convDesc = cudnn.setConvolutionDescriptor(
-           { padA = self.pad, filterStrideA = self.stride,
-             dataType = cudnn.configmap(torch.type(self.weight))
-           })
+        self.convDescData = { padA = self.pad, filterStrideA = self.stride,
+                              dataType = cudnn.configmap(torch.type(self.weight))}
+        self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)

         -- get output shape, resize output
         local iwidth = input:size(5)
diff --git a/ffi.lua b/ffi.lua
index d5b5f8c..458e382 100644
--- a/ffi.lua
+++ b/ffi.lua
@@ -1614,10 +1614,10 @@ end

 -- check cuDNN version
 cudnn.version = tonumber(cudnn.C.cudnnGetVersion())
-if cudnn.version < 5005 then
-   error('These bindings are for version 5005 or above, '
+if cudnn.version < 5005 or cudnn.version >= 6000 then
+   error('These bindings are for CUDNN 5.x (5005 <= cudnn.version < 6000), '
          .. 'while the loaded CuDNN is version: ' .. cudnn.version
-         .. ' \nAre you using an older version of CuDNN?')
+         .. ' \nAre you using an older or newer version of CuDNN?')
 end

 -- check GPU driver version
diff --git a/find.lua b/find.lua
index ceca39e..65f7f65 100644
--- a/find.lua
+++ b/find.lua
@@ -2,7 +2,12 @@ local ffi = require 'ffi'

 find = {}
 find.__index = find
---find.verbose=true
+
+-- default is to get verbose on errors
+find.verbose=false
+find.verboseError=true
+find.verboseFallback=true
+
 -- constants to index array tables below
 local Fwd, BwdFilter, BwdData = 1, 2, 3

@@ -64,54 +69,19 @@ local bwdDataAlgoNames = {

 local algoNames = {fwdAlgoNames, bwdFilterAlgoNames, bwdDataAlgoNames}

--- this function is here and not in init.lua (and has the suffix) as generic
--- getConvolutionDescriptor methood should have native lua tables instead of FFI
--- (like setConvolutionDescriptor does, to be used with it)
--- However this is counterproductive for the purposes it's used in this module
-local function getConvolutionDescriptor_ffi(desc)
-   local CUDNN_DIM_MAX=8
-   local data = {
-      dim_p = ffi.new('int[1]'),
-      padA = ffi.new('int[?]', CUDNN_DIM_MAX),
-      filterStrideA = ffi.new('int[?]', CUDNN_DIM_MAX),
-      upscaleA = ffi.new('int[?]', CUDNN_DIM_MAX),
-      mode_p = ffi.new('cudnnConvolutionMode_t[1]'),
-      math_p = ffi.new('cudnnDataType_t[1]')
-   }
-
-   local status = cudnn.call('cudnnGetConvolutionNdDescriptor', desc[0], CUDNN_DIM_MAX,
-                             data.dim_p, data.padA, data.filterStrideA,
-                             data.upscaleA, data.mode_p, data.math_p)
-   if (status ~= ffi.C.CUDNN_STATUS_SUCCESS) then
-      if find.verbose or find.verboseError then
-         print("cudnnGetConvolutionNdDescriptor failed: ", tonumber(status))
-         return nil
-      end
-   end
-
-   data.arrayLength = data.dim_p[0]
-   data.mode = data.mode_p[0]
-   data.dataType = data.math_p[0]
-   return data
+local function convDataString(layer)
+   local info = ''
+   if layer.convDescData then
+      local desc = layer.convDescData
+      info = ' convDesc=[mode : ' .. desc.mode .. ' datatype : ' .. desc.dataType .. ']'
+   end
+   return info .. ' hash=' .. layer.autotunerHash
 end

 local function verboseCall(layer, f, ...)
-   if find.verbose then
-      print("find:verboseCall: calling " .. f .. ", hash: ", layer.autotunerHash)
-   end
    local status = cudnn.call(f, ...)
    if (status ~= ffi.C.CUDNN_STATUS_SUCCESS) and (find.verbose or find.verboseError) then
-      local prefix = "find:verboseCall:"
-      print( prefix .. f .. " failed: ", tonumber(status))
-      if layer.convDesc then
-         local desc = getConvolutionDescriptor_ffi(layer.convDesc)
-         if desc then
-            print (prefix .. ' conv desc mode : ', desc.mode, ' datatype : ', desc.datatype)
-         end
-      end
-   end
-   if find.verbose then
-      print("find:verboseCall: success, " .. f )
+      print("\n" .. f .. " failed: ", tonumber(status), convDataString(layer))
    end
    return status
 end

@@ -123,36 +93,39 @@ local function checkedCall(layer, f, ...)
       local str = ffi.string(cudnn.C.cudnnGetErrorString(status))
       error('Error in CuDNN: ' .. str .. ' ('..f..')')
    end
+   return status
 end
 find.checkedCall = checkedCall

 local function noFallback(layer)
-   if find.verbose then
-      print("find.defaultFallback: verboseCall failed for: ", layer.autotunerHash)
+   if find.verbose or find.verboseFallback then
+      print("\nfind.defaultFallback: verboseCall failed for: ", convDataString(layer))
    end
    return false
 end

+local function fallbackWarning(layer, msg)
+   if find.verbose or find.verboseFallback then
+      print("\n *** find.verboseFallback: " .. msg ..
+            "\n *** Falling back to 32-bit math for: " .. convDataString(layer))
+      print(" *** [ Set cudnn.find.verboseFallback to false to disable this message ] *** ")
+      print(" *** [ Alternatively, you may force CUDNN to always operate on CudaHalfTensors via 32-bit float conversion, in Lua: ] ***\n"
+            .." *** [ cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT'}) ] ***")
+      print(" *** [ Note: result may be faster or slower than native FP16, depending on your GPU and CUDNN operations ] *** ")
+   end
+end
+
 local function defaultFallback(layer, replay)
    -- read conv descriptor
-   local convDescData = getConvolutionDescriptor_ffi(layer.convDesc)
-
-   if convDescData and convDescData.dataType == ffi.C.CUDNN_DATA_HALF then
-      if find.verbose then
-         if replay then
-            print("find.defaultFallback: replay for ", layer.autotunerHash)
-         else
-            print("find.defaultFallback: no 16-bit float algo found, will try 32 bits for ", layer.autotunerHash)
-         end
-      end
-      -- using direct FFI call, not cudnn.setConvolutionDescriptor, for efficiency and clarity
-      checkedCall(layer, 'cudnnSetConvolutionNdDescriptor', layer.convDesc[0],
-                  convDescData.arrayLength,
-                  convDescData.padA,
-                  convDescData.filterStrideA,
-                  convDescData.upscaleA,
-                  convDescData.mode,
-                  ffi.C.CUDNN_DATA_FLOAT)
+   local convDescData = layer.convDescData
+   if convDescData and convDescData.dataType == "CUDNN_DATA_HALF" then
+      fallbackWarning(layer, replay
+                         and "16->32 bit fallback replay "
+                         or "No native FP16 algo found, will try 32-bit math")
+      -- update our record with fallback value
+      convDescData.dataType = "CUDNN_DATA_FLOAT"
+      -- update the descriptor in CUDNN
+      cudnn.setConvolutionDescriptor(convDescData, layer.convDesc)
       return true
    else
       return false
@@ -358,6 +331,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
   local function callCudnn(layer)
      local ret = 0
      validResults = 0
+     if not layer.convDesc or not layer.convDesc[0] then
+        error("No convDesc set on layer!")
+     end
+
      if self.algoFamily == FindExFamily then
         -- query temp workspace size
        local tempWorkspace, tempWorkspaceSize = cudnn.getSharedWorkspace()
@@ -375,6 +352,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
      else
         -- GetFamily: emulate findXXX results layout
         numPerfResults[0]=1
+        perfResults[0].algo = 0
+        perfResults[0].memory = 0
+        perfResults[0].status = 1
+
        local algWorkspaceLimit = layer.workspace_limit
           or (layer.nInputPlane * layer.kH * layer.kW * layer.weight.elementSize())

@@ -382,6 +363,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
                       cudnn.getHandle(),
                       params[1], params[3], layer.convDesc[0], params[6],
                       algSearchMode, algWorkspaceLimit, algType[findAPI_idx])
+        if ret ~= 0 then
+           return ret
+        end
+
        local retAlgo = algType[findAPI_idx][0]
        if find.verbose then
           print(string.format(
@@ -395,6 +380,9 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
                       cudnn.getHandle(),
                       params[1], params[3], layer.convDesc[0], params[6],
                       retAlgo, bufSize:data())
+        if ret ~= 0 then
+           return ret
+        end
        if find.verbose then
           print(string.format(
              "\n" .. getWSAlgos[findAPI_idx] .. ": bufSize: %d, current ws: %d",
@@ -427,31 +415,75 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
               local fallback = ''
               if (useFallback) then fallback = "[FALLBACK]" end
               print(string.format(
-                       "\n" .. API .. " algo: %s (%d, status: %d), memory: %8d, count: %d"
-                          .. " hash: %45s " .. cacheHit .. fallback,
+                       "\n" .. API .. " algo[%d]: %s (%d, status: %d), time: %.04f, memory: %8d, count: %d"
+                          .. " %s " .. cacheHit .. fallback,
+                       validResults,
                        algoNames[findAPI_idx][cachedAlgo[validResults].algo+1],
                        cachedAlgo[validResults].algo, cachedAlgo[validResults].status,
-                       cachedAlgo[validResults].memory, r, layer.autotunerHash))
+                       cachedAlgo[validResults].time, cachedAlgo[validResults].memory, r, convDataString(layer)))
            end
         end
      end
-     if validResults < 1 and find.verbose then
-        print("Could not find any valid convolution algorithms for sizes: " .. layer.autotunerHash)
-        -- todo: add case of multi-stream not fitting in size
+     if validResults < 1 then
        return 1
     end
     return 0
  end
+
+  local function performanceFallback(layer)
+     -- read conv descriptor
+     local convDescData = layer.convDescData
+
+     if convDescData and convDescData.dataType == "CUDNN_DATA_HALF" then
+        local savedResults = cachedAlgo
+        local savedNum = validResults
+        cachedAlgo = {}
+        validResults = 0
+        useFallback = true
+
+        -- update our record with fallback value
+        layer.convDescData.dataType = "CUDNN_DATA_FLOAT"
+        -- update the descriptor in CUDNN
+        cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
+        -- do the actual call
+        local status = callCudnn(layer)
+        -- check if we got better results with float32
+        if status == 0 and validResults > 0 and cachedAlgo[1].time < savedResults[1].time then
+           if find.verbose or find.verboseFallback then
+              local msg = string.format("find.performanceFallback: found 32-bit float op is faster (%f) than FP16(%f), memory increase: %fM",
+                                        cachedAlgo[1].time, savedResults[1].time,
+                                        (tonumber(cachedAlgo[1].memory)-tonumber(savedResults[1].memory))/Meg)
+              fallbackWarning(layer, msg)
+           end
+           return
+        end
+        -- restore if we didn't
+        cachedAlgo = savedResults
+        validResults = savedNum
+        -- update our record with fallback value
+        layer.convDescData.dataType = "CUDNN_DATA_HALF"
+        -- update the descriptor in CUDNN
+        cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
+
+     end
+  end
+
  -- do the actual call
  local status = callCudnn(layer)
  if status ~= 0 or validResults < 1 then
     if self.fallback and self.fallback(layer) then
-       useFallback = true;
+       useFallback = true
        status = callCudnn(layer)
-       if status ~= 0 or validResults < 1 then
-          error ("Fallback attempt failed for " .. API .. ', sizes: ' .. layer.autotunerHash)
-       end
+    end
+    -- check again
+    if status ~= 0 or validResults < 1 then
+       error (API .. ' failed, sizes: ' .. convDataString(layer))
+    end
+ else
+    -- if we are running Find or FindEx in native fp16, check if this algo is actually faster in pseudo-fp16
+    if self.algoFamily ~= GetFamily then
+       performanceFallback(layer)
     end
  end
  self:store(layer, findAPI_idx, cachedAlgo)
@@ -475,9 +507,9 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
      local fallback = ""
      if (useFallback) then fallback = "[FALLBACK]" end
      print(string.format(
-              "\n" .. API .. ": %s(%d)[%d of %d] Workspace: %8fM (current ws size %fM, max: %dM free: %dM) hash: %45s" .. cacheHit .. fallback,
+              "\n" .. API .. ": %s(%d)[%d of %d] Workspace: %8fM (current ws size %fM, max: %dM free: %dM) %s" .. cacheHit .. fallback,
               algoNames[findAPI_idx][cachedAlgo[retAlgo].algo+1], cachedAlgo[retAlgo].algo, retAlgo, #cachedAlgo,
-              tonumber(cachedAlgo[retAlgo].memory)/Meg, curWorkspaceSize/Meg, self.maxWorkspaceSize/Meg, freeMemory/Meg, layer.autotunerHash))
+              tonumber(cachedAlgo[retAlgo].memory)/Meg, curWorkspaceSize/Meg, self.maxWorkspaceSize/Meg, freeMemory/Meg, convDataString(layer)))
    end
    return cachedAlgo[retAlgo].algo
 end
@@ -513,9 +545,9 @@ end

 function find:forwardAlgorithm(layer, params)
-    if layer.fmode then
-       setupWS(layer, params, layer.fmode, Fwd)
-       return layer.fmode
+   if layer.fmode then
+      setupWS(layer, params, layer.fmode, Fwd)
+      return layer.fmode
    end
    local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
    if layer.fastest_mode or cudnn.fastest == true then
@@ -526,9 +558,9 @@ end

 function find:backwardFilterAlgorithm(layer, params)
    -- Check if we are in "sticky" mode
-    if layer.bwmode then
-       setupWS(layer, params, layer.bwmode, BwdFilter)
-       return layer.bwmode
+   if layer.bwmode then
+      setupWS(layer, params, layer.bwmode, BwdFilter)
+      return layer.bwmode
    end
    local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
    if layer.fastest_mode or cudnn.fastest == true then
@@ -540,9 +572,9 @@ end

 function find:backwardDataAlgorithm(layer, params)
    -- Check if we are in "sticky" mode
-    if layer.bdmode then
-       setupWS(layer, params, layer.bdmode, BwdData)
-       return layer.bdmode
+   if layer.bdmode then
+      setupWS(layer, params, layer.bdmode, BwdData)
+      return layer.bdmode
    end
    local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
    if layer.fastest_mode or cudnn.fastest == true then
diff --git a/functional.lua b/functional.lua
index e877cec..5385ffb 100644
--- a/functional.lua
+++ b/functional.lua
@@ -73,11 +73,10 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
                                                filterDimA = {nOutputPlane, nInputPlane, kH, kW}})

     -- create a convolution descriptor
-    local convDesc = cudnn.setConvolutionDescriptor(
-       { padA = {padH, padW},
+    local convDescData = { padA = {padH, padW},
          filterStrideA = {strideH, strideW},
          dataType = getMathType(weight) }
-    );
+    local convDesc = cudnn.setConvolutionDescriptor(convDescData);

     -- create input descriptor
     local iDesc = cudnn.toDescriptor(input)
@@ -97,6 +96,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
     local oDesc = cudnn.toDescriptor(output)

     local layer = {
+       convDescData = convDescData,
        convDesc = convDesc,
        weight = weight,
        nInputPlane = nInputPlane,
@@ -141,17 +141,17 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight,
                                                filterDimA = {nOutputPlane, nInputPlane, kH, kW} })
     -- create a convolution descriptor
-    local convDesc = cudnn.setConvolutionDescriptor(
-       { padA = {padH, padW},
-         filterStrideA = {strideH, strideW},
-         dataType = getMathType(weight)
-       }
-    );
+    local convDescData = { padA = {padH, padW},
+         filterStrideA = {strideH, strideW},
+         dataType = getMathType(weight)
+    }
+    local convDesc = cudnn.setConvolutionDescriptor(convDescData);

     -- create input, output descriptor
     local iDesc = cudnn.toDescriptor(input)
     local oDesc = cudnn.toDescriptor(output)

     local layer = {
+       convDescData = convDescData,
        convDesc = convDesc,
        weight = weight,
        nInputPlane = nInputPlane,
@@ -193,11 +193,10 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW
     local weightDesc = cudnn.setFilterDescriptor({ dataType = cudnn.typemap[torch.type(input)],
                                                filterDimA = {nOutputPlane, nInputPlane, kH, kW}})

     -- create a convolution descriptor
-    local convDesc = cudnn.setConvolutionDescriptor(
-       { padA = {padH, padW},
-         filterStrideA = {strideH, strideW},
-         dataType = getMathType(gradWeight) }
-    );
+    local convDescData = { padA = {padH, padW},
+         filterStrideA = {strideH, strideW},
+         dataType = getMathType(gradWeight) }
+    local convDesc = cudnn.setConvolutionDescriptor(convDescData);

     -- create input, output descriptor
     local iDesc = cudnn.toDescriptor(input)
@@ -205,6 +204,7 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW

     local layer = {
        convDesc = convDesc,
+       convDescData = convDescData,
        weight = gradWeight,
        nInputPlane = nInputPlane,
        nOutputPlane = nOutputPlane,
diff --git a/init.lua b/init.lua
index 6c8abd7..b4ba8eb 100644
--- a/init.lua
+++ b/init.lua
@@ -16,9 +16,6 @@ cudnn.fastest = false
 -- Warning: this option is experimental and assumes at least 2 warmup iterations!
 cudnn.useFindEx = false

--- if true, use 'pseudo-fp16' (half storage, float math) even if true fp16 math is available
-cudnn.useFloatMathForHalf = false
-
 -- amount of memory to use on 1st iteration for FindEx
 cudnn.initialWorkspaceBytes = 1024

@@ -209,17 +206,19 @@ end

 function cudnn.setConvolutionDescriptor(data, desc)
-   local dim = data.arrayLength or #data.padA
-   local upscale = data.upscaleA or torch.IntStorage(dim):fill(1)
+   if not data.arrayLength then data.arrayLength = #data.padA end
+   if not data.upscaleA then data.upscaleA = torch.IntStorage(data.arrayLength):fill(1) end
+   if not data.mode then data.mode = 'CUDNN_CROSS_CORRELATION' end
+
    local myDesc = desc or cudnn.createDescriptors(
       1, 'struct cudnnConvolutionStruct*[?]',
       'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
    errcheck('cudnnSetConvolutionNdDescriptor', myDesc[0],
-            dim,
+            data.arrayLength,
             torch.IntTensor(data.padA):data(),
             torch.IntTensor(data.filterStrideA):data(),
-            torch.IntTensor(upscale):data(),
-            data.mode or 'CUDNN_CROSS_CORRELATION',
+            torch.IntTensor(data.upscaleA):data(),
+            data.mode,
             data.dataType)
    return myDesc
 end

diff --git a/test/test.lua b/test/test.lua
index 2b69fa2..46723fc 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -11,7 +11,7 @@ local jac = nn.Jacobian
 local testparams_half = {
    test_type = 'torch.CudaHalfTensor',
    precision_forward = 2e-1,
-   precision_backward = 8,
+   precision_backward = 10,
    precision_jac = 1e-3,
    precision_io = 1e-1,
 }
@@ -131,7 +131,7 @@ function cudnntest.SpatialConvolution()
    local input = torch.randn(bs,from,inj,ini):cuda()
    local gradOutput = torch.randn(bs,to,outj,outi):cuda()
    local sconv = nn.SpatialConvolution(from,to,ki,kj,si,sj):cuda()
-   local gconv = cast(cudnn.SpatialConvolution(from,to,ki,kj,si,sj)):fastest()
+   local gconv = cast(cudnn.SpatialConvolution(from,to,ki,kj,si,sj))
    gconv.weight:copy(sconv.weight)
    gconv.bias:copy(sconv.bias)

@@ -162,7 +162,7 @@ function cudnntest.SpatialFullConvolution()
    local input = torch.randn(bs,from,inj,ini):cuda()
    local gradOutput = torch.randn(bs,to,outj,outi):cuda()
    local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda()
-   local gconv = cast(cudnn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda():fastest())
+   local gconv = cast(cudnn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda())
    gconv.weight:copy(sconv.weight)
    gconv.bias:copy(sconv.bias)

@@ -189,7 +189,7 @@ function cudnntest.TemporalConvolution()
    local input = torch.randn(bs,ini,inputFrameSize):cuda()
    local gradOutput = torch.randn(bs,outi,outputFrameSize):cuda()
    local sconv = nn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda()
-   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda():fastest())
+   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda())
    gconv.weight:copy(sconv.weight:view(gconv.weight:size()))
    gconv.bias:copy(sconv.bias)

@@ -225,7 +225,7 @@ function cudnntest.TemporalConvolution_padding_batch()
    local groundweight = sconv.gradWeight
    local groundbias = sconv.gradBias

-   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si,pad_h):cuda():fastest())
+   local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si,pad_h):cuda())
    gconv.weight:copy(sconv.weight:view(gconv.weight:size()))
    gconv.bias:copy(sconv.bias)
    gconv:forward(cast(input))
@@ -330,10 +330,14 @@ function cudnntest.VolumetricFullConvolution()
    local outk = (ink-1)*sk+kk
    local scale = math.random()

+   if testparams.test_type == 'torch.CudaDoubleTensor' then
+      return
+   end
+
    local input = torch.randn(bs,from,ink,inj,ini):cuda()
    local gradOutput = torch.randn(bs,to,outk,outj,outi):cuda()
    local sconv = nn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda()
-   local gconv = cast(cudnn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda():fastest())
+   local gconv = cast(cudnn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda())
    gconv.weight:copy(sconv.weight)
    gconv.bias:copy(sconv.bias)
@@ -936,18 +940,21 @@ mytester:add(cudnntest)

 cudnn.verbose=false
 cudnn.find.verbose=false
+-- this is the default, keep it for demo of 16->32 bit float fallback
+cudnn.find.verboseFallback=true
 cudnn.useFindEx=false

-for i = 1, cutorch.getDeviceCount() do
-   cudnn.configureMath()
+for i = 1, 1 do -- cutorch.getDeviceCount() do

-   for _, benchmark in ipairs({true, false}) do
+   for _, benchmark, fast in ipairs({true, false}) do
      cudnn.benchmark = benchmark
---      cudnn.reset()
+      -- use random fastest() test for non-benchmark case
+      if not benchmark then cudnn.fastest = (math.random(0,1) == 1) end
+
      local prop = cutorch.getDeviceProperties(i)

      print('Running test on device: #' .. i .. ' : ' .. prop.name
-              .. ' with benchmark = ' .. tostring(cudnn.benchmark))
+              .. ' with benchmark = ' .. tostring(cudnn.benchmark) .. ' and fastest = ' .. tostring(cudnn.fastest))

      cutorch.setDevice(i)
@@ -958,14 +965,6 @@ for i = 1, cutorch.getDeviceCount() do

      print( 'Testing torch.CudaHalfTensor, torch.cudnn fp16 math is : ',
             cudnn.configmap('torch.CudaHalfTensor' ),
             ', cutorch.hasFastHalfInstructions() is ', cutorch.hasFastHalfInstructions())
-     if cudnn.configmap('torch.CudaHalfTensor') ~= 'CUDNN_DATA_FLOAT' then
-        print([[ Warning: 32-bit float math is forced for CudaHalfTensor test
-        even though native fast 16-bit float math is available for this device.
-        The reason is cudnn convolution algo find methods for fp16 and certain size combinations may fail.
-        This should be fixed in next release.]])
-        cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT'})
-     end
-
      testparams = testparams_half
      mytester:run()
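Note on the overall pattern (not part of the patch itself): the common thread of the Lua-side changes is that every convolution module now keeps the arguments it passed to cudnn.setConvolutionDescriptor in a plain Lua table (layer.convDescData). find.lua can then perform the 16->32 bit fallback by rewriting that table and re-applying it to the existing descriptor through the two-argument form cudnn.setConvolutionDescriptor(data, desc), instead of reading the descriptor back through FFI with cudnnGetConvolutionNdDescriptor. A minimal illustrative sketch of that pattern follows; the layer table, the pad/stride literals and the data type value are made up for the example and are not taken from the patch:

-- illustrative only: mirrors the convDescData pattern introduced by this patch
local layer = {}
layer.convDescData = {
   padA          = {1, 1},             -- {padH, padW}
   filterStrideA = {1, 1},             -- {dH, dW}
   upscaleA      = {1, 1},
   dataType      = 'CUDNN_DATA_HALF',  -- e.g. cudnn.configmap('torch.CudaHalfTensor')
}
-- create the cuDNN descriptor from the table
-- (arrayLength, upscaleA and mode defaults are filled in by init.lua)
layer.convDesc = cudnn.setConvolutionDescriptor(layer.convDescData)

-- FP16 -> FP32 fallback: rewrite the Lua-side record, then re-apply it to the
-- same descriptor; no FFI read-back of the descriptor is needed
if layer.convDescData.dataType == 'CUDNN_DATA_HALF' then
   layer.convDescData.dataType = 'CUDNN_DATA_FLOAT'
   cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
end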