Update TH, THC, THNN, THCUNN #452

Merged Jan 14, 2017 (23 commits)
Commits
5340291 Update FindARM.cmake (temerick, Jan 3, 2017)
71cef62 Fix condition for threadArgErrorHandler (colesbury, Jan 5, 2017)
d186fdb Fix THHalf issues with MSVC. (gchanan, Jan 5, 2017)
35e1adf documentation parity with torch7 for catArray impl (killeent, Jan 6, 2017)
d070178 Instantiate 128kb of scratch space in GPU memory per-device by default (killeent, Jan 9, 2017)
35758f5 Get rid of a few unused imports. (gchanan, Jan 9, 2017)
17c998e fixing arm64 build (soumith, Jan 10, 2017)
68e2769 Re-route thrust memory allocation to THCudaMalloc / THCudaFree (gchanan, Jan 10, 2017)
4a8906d Add THCThrustAllocator.cuh to install files to downstream projects ca… (gchanan, Jan 10, 2017)
5065197 Merge pull request #666 from gchanan/thrustalloc (soumith, Jan 10, 2017)
2b88d85 Re-route thrust memory allocation to THCudaMalloc / THCudaFree in cunn. (gchanan, Jan 10, 2017)
3e91c5e Merge pull request #668 from gchanan/thrustalloc (soumith, Jan 11, 2017)
b4bb4b6 simd.h: really fix the arm64 (i.e. Aarch64) build (cdluminate, Jan 11, 2017)
82088a8 parallelizing catArray to multiple tensors per kernel (#635) (killeent, Jan 12, 2017)
b076944 Fix for atomicAdd(double) for CUDA_VERSION < 8000 (pavanky, Jan 13, 2017)
f467848 Avoid strict aliasing warning in float/half conversions. (gchanan, Jan 13, 2017)
5171e56 Ensure atomicAdd(double) is visible to host side code (pavanky, Jan 13, 2017)
e67b525 Merge pull request #911 from gchanan/convWarning (soumith, Jan 13, 2017)
eab5c19 Avoid strict aliasing warning in float/half conversions. (gchanan, Jan 13, 2017)
ca74bb1 Merge pull request #675 from pavanky/more-atomic-fix (soumith, Jan 13, 2017)
b8a5b1e Merge commit 'e67b525388a5ae11ed243e94bbc25b4934b03a66' (soumith, Jan 13, 2017)
b5c9f5c Merge commit 'ca74bb17b8823d74b83433e2743f23e572501c72' (soumith, Jan 13, 2017)
fd600b1 Merge commit '2b88d85505d7317f980e69201e72694d6d5905a4' (soumith, Jan 13, 2017)
2 changes: 1 addition & 1 deletion torch/lib/TH/THGeneral.c
@@ -109,7 +109,7 @@ void _THArgCheck(const char *file, int line, int condition, int argNumber, const
snprintf(msg + n, 2048 - n, " at %s:%d", file, line);
}

- if (threadArgErrorHandlerData)
+ if (threadArgErrorHandler)
(*threadArgErrorHandler)(argNumber, msg, threadArgErrorHandlerData);
else
(*defaultArgErrorHandler)(argNumber, msg, defaultArgErrorHandlerData);
8 changes: 5 additions & 3 deletions torch/lib/TH/THHalf.c
@@ -77,15 +77,17 @@ float TH_half2float(THHalf h)
}

int temp = ((sign << 31) | (exponent << 23) | mantissa);

- return *((float*)((void*)&temp));
+ float x;
+ memcpy(&x,&temp,sizeof(float));
+ return x;
}

THHalf TH_float2half(float f)
{
THHalf ret;

- unsigned x = *((int*)(void*)(&f));
+ unsigned x;
+ memcpy(&x,&f,sizeof(f));
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;

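The removed pointer casts (e.g. *((float*)((void*)&temp))) read an int object through a float lvalue, which violates C's strict-aliasing rule and is what the compiler warning flagged; memcpy expresses the same bit reinterpretation with defined behavior and typically compiles to a single register move. A minimal standalone sketch of the pattern, not part of the patch, assuming a 32-bit unsigned int:

#include <stdio.h>
#include <string.h>

/* Type-pun a float to its 32-bit pattern without breaking strict
 * aliasing: memcpy is well-defined for this, and optimizing compilers
 * lower it to a plain move. */
static unsigned float_bits(float f) {
    unsigned u;
    memcpy(&u, &f, sizeof u);
    return u;
}

int main(void) {
    printf("0x%08x\n", float_bits(1.0f)); /* prints 0x3f800000 */
    return 0;
}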
4 changes: 2 additions & 2 deletions torch/lib/TH/cmake/FindARM.cmake
@@ -68,9 +68,9 @@ if(NOT NEON_FOUND)
MESSAGE(STATUS "Could not find hardware support for NEON on this machine.")
endif(NOT NEON_FOUND)
if(NOT CORTEXA8_FOUND)
MESSAGE(STATUS "No OMAP3 processor on this on this machine.")
MESSAGE(STATUS "No OMAP3 processor on this machine.")
endif(NOT CORTEXA8_FOUND)
if(NOT CORTEXA9_FOUND)
MESSAGE(STATUS "No OMAP4 processor on this on this machine.")
MESSAGE(STATUS "No OMAP4 processor on this machine.")
endif(NOT CORTEXA9_FOUND)
mark_as_advanced(NEON_FOUND)
10 changes: 8 additions & 2 deletions torch/lib/TH/generic/THTensorCopy.c
@@ -4,7 +4,7 @@

void THTensor_(copy)(THTensor *tensor, THTensor *src)
{
- TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = (real)(*src_data);)
+ TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
}

#define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \
@@ -25,6 +25,12 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src
TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)TH_half2float(*src_data);) \
}

+ #define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \
+ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \
+ { \
+ TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \
+ }

#ifndef TH_REAL_IS_HALF
IMPLEMENT_THTensor_COPY(Byte, unsigned char)
IMPLEMENT_THTensor_COPY(Char, char)
@@ -36,7 +42,7 @@ IMPLEMENT_THTensor_COPY(Double, double)
IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf)
#else
/* only allow pass-through for Half */
- IMPLEMENT_THTensor_COPY(Half, THHalf)
+ IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf)
IMPLEMENT_THTensor_COPY_TO_HALF(Byte, unsigned char)
IMPLEMENT_THTensor_COPY_TO_HALF(Char, char)
IMPLEMENT_THTensor_COPY_TO_HALF(Short, short)
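A plausible reading of why the half-to-half case needs its own cast-free macro: when real is THHalf (a struct in TH's software-half fallback), the generic macro's (real)(*src_data) is a cast to a struct type, which C forbids because cast targets must be scalar. A sketch under that assumption; the THHalf layout here is assumed, not quoted from TH:

#include <stddef.h>

typedef struct { unsigned short x; } THHalf; /* assumed layout */

/* Pass-through copy for same-type half tensors: plain struct assignment
 * copies the raw bits, whereas a cast like (THHalf)(*src) would not
 * compile, since C permits casts only to scalar types. */
static void copyHalfArray(THHalf *dst, const THHalf *src, ptrdiff_t n) {
    for (ptrdiff_t i = 0; i < n; ++i)
        dst[i] = src[i];
}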
4 changes: 2 additions & 2 deletions torch/lib/TH/generic/simd/simd.h
@@ -53,7 +53,7 @@ enum SIMDExtensions
};


- #if defined(__arm__)
+ #if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64

#if defined(__NEON__)

@@ -80,7 +80,7 @@ static inline uint32_t detectHostSIMDExtensions()
return SIMDExtension_VSX;
}

- #else
+ #else // PPC64 without VSX

static inline uint32_t detectHostSIMDExtensions()
{
1 change: 1 addition & 0 deletions torch/lib/THC/CMakeLists.txt
@@ -268,6 +268,7 @@ INSTALL(FILES
THCTensorTypeUtils.cuh
THCTensorRandom.cuh
THCTensorMathMagma.cuh
+ THCThrustAllocator.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")

INSTALL(FILES
5 changes: 4 additions & 1 deletion torch/lib/THC/THCAtomics.cuh
@@ -110,7 +110,7 @@ static inline __device__ void atomicAdd(half *address, half val) {
}
#endif

- #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// from the CUDA C Programming Guide
static inline __device__ void atomicAdd(double *address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
@@ -126,6 +126,9 @@ static inline __device__ void atomicAdd(double *address, double val) {
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
}
+ #elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
+ // This needs to be defined for the host-side compilation pass
+ static inline __device__ void atomicAdd(double *address, double val) { }
#endif

#endif // THC_ATOMICS_INC
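For context, the compare-and-swap loop this file provides on pre-sm_60 targets (and now also whenever CUDA_VERSION < 8000, since CUDA 8 is the toolkit that added the native double overload for sm_60) follows the well-known pattern from the CUDA C Programming Guide. A self-contained sketch of that pattern:

// Software atomicAdd for double: reinterpret the value as a 64-bit
// integer and retry with atomicCAS until no other thread raced us.
__device__ double atomicAddDouble(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Comparing the integer bit patterns (rather than doubles) avoids an
    // infinite loop when the stored value is NaN, since NaN != NaN.
  } while (assumed != old);
  return __longlong_as_double(old);
}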
21 changes: 16 additions & 5 deletions torch/lib/THC/THCGeneral.c
@@ -1,7 +1,6 @@
#include "THCGeneral.h"
#include "TH.h"
#include "THCAllocator.h"
#include "THCBlas.h"
#include "THCCachingHostAllocator.h"
#include "THCStream.h"
#include "THCThreadLocal.h"
@@ -10,7 +9,12 @@
#include <stdint.h>

/* Size of scratch space available in global memory per each SM + stream */
- #define GLOBAL_SCRATCH_SPACE_PER_SM_STREAM 4 * sizeof(float)
+ #define MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM 4 * sizeof(float)
+
+ /* Minimum amount of scratch space per device. Total scratch memory per
+  * device is either this amount, or the # of SMs * the space per SM defined
+  * above, whichever is greater.*/
+ #define MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE 32768 * sizeof(float)

THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device);
@@ -108,9 +112,15 @@ void THCudaInit(THCState* state)
res->streams[0] = NULL;

/* The scratch space that we want to have available per each device is
- based on the number of SMs available per device */
+ based on the number of SMs available per device. We guarantee a
+ minimum of 128kb of space per device, but to future-proof against
+ future architectures that may have huge #s of SMs, we guarantee that
+ we have at least 16 bytes for each SM. */
int numSM = state->deviceProperties[i].multiProcessorCount;
- size_t sizePerStream = numSM * GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
+ size_t sizePerStream =
+   MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >= numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM ?
+   MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE :
+   numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
res->scratchSpacePerStream = sizePerStream;
}

@@ -753,7 +763,8 @@ void THCHeapUpdate(THCState *state, ptrdiff_t size) {
}
}

- #undef GLOBAL_SCRATCH_SPACE_PER_SM_STREAM
+ #undef MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM
+ #undef MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE

#include "THCStorage.c"
#include "THCAllocator.c"
14 changes: 10 additions & 4 deletions torch/lib/THC/THCHalf.cu
@@ -1,4 +1,5 @@
#include "THCHalf.h"
#include "THCThrustAllocator.cuh"
#include <thrust/transform.h>
#include <thrust/execution_policy.h>

@@ -11,19 +12,21 @@ struct __float2halfOp {
};

void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) {
+ THCThrustAllocator thrustAlloc(state);
thrust::transform(
#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
#else
thrust::device,
#endif
in, in + len, out, __float2halfOp());
}

void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
+ THCThrustAllocator thrustAlloc(state);
thrust::transform(
#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
+ thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
#else
thrust::device,
#endif
@@ -58,14 +61,17 @@ float THC_half2float(half h)

int temp = ((sign << 31) | (exponent << 23) | mantissa);

- return *((float*)((void*)&temp));
+ float x;
+ memcpy(&x,&temp,sizeof(float));
+ return x;
}

half THC_float2half(float f)
{
half ret;

- unsigned x = *((int*)(void*)(&f));
+ unsigned x;
+ memcpy(&x,&f,sizeof(f));
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;

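thrust::cuda::par(alloc) accepts any object exposing Thrust's minimal allocator protocol (a char value_type plus allocate/deallocate), which is presumably how THCThrustAllocator.cuh routes Thrust's temporary buffers through THCudaMalloc/THCudaFree instead of raw cudaMalloc. A sketch of that shape; the THC function signatures are assumed from headers of this era, not quoted from the patch:

#include "THCGeneral.h"   // THCState, THCudaMalloc, THCudaFree (assumed)
#include <cstddef>

// Hypothetical stand-in for THCThrustAllocator: Thrust requests raw
// bytes via allocate() and returns them via deallocate().
struct ThrustAllocatorSketch {
  typedef char value_type;

  explicit ThrustAllocatorSketch(THCState* state) : state_(state) {}

  char* allocate(std::ptrdiff_t size) {
    void* p = NULL;
    THCudaCheck(THCudaMalloc(state_, &p, size));  // device-side temp buffer
    return static_cast<char*>(p);
  }

  void deallocate(char* p, size_t size) {
    THCudaCheck(THCudaFree(state_, p));  // size unused; required by protocol
  }

  THCState* state_;
};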
5 changes: 5 additions & 0 deletions torch/lib/THC/THCReduceAll.cuh
@@ -137,6 +137,11 @@ inline ptrdiff_t getTwoPassBlocks(THCState* state, ptrdiff_t elements) {
THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT);
THAssert(scratchSpace > 0);

+ // Limit to 1024 due to dimensionality constraint
+ if (scratchSpace > 1024) {
+   scratchSpace = 1024;
+ }

if (numBlocks > scratchSpace) {
numBlocks = scratchSpace;
}
1 change: 1 addition & 0 deletions torch/lib/THC/THCStorage.cu
@@ -1,5 +1,6 @@
#include "THCStorage.h"

#include "THCThrustAllocator.cuh"
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#if CUDA_VERSION >= 7000
2 changes: 0 additions & 2 deletions torch/lib/THC/THCStorageCopy.c
@@ -1,7 +1,5 @@
#include "THCStorageCopy.h"
#include "THCGeneral.h"

#include "THCHalf.h"
#include "THCTensorCopy.h"

#include "generic/THCStorageCopy.c"
4 changes: 0 additions & 4 deletions torch/lib/THC/THCTensorCopy.c
@@ -1,9 +1,5 @@
#include "THCTensorCopy.h"
#include "THCGeneral.h"
#include "THCTensor.h"
#include "THCCachingHostAllocator.h"

#include "THCHalf.h"

#include "generic/THCTensorCopy.c"
#include "THCGenerateAllTypes.h"
1 change: 1 addition & 0 deletions torch/lib/THC/THCTensorMasked.cuh
@@ -5,6 +5,7 @@
#include "THCTensorCopy.h"
#include "THCApply.cuh"
#include "THCReduce.cuh"
#include "THCThrustAllocator.cuh"

#include <thrust/device_ptr.h>
#include <thrust/scan.h>
1 change: 1 addition & 0 deletions torch/lib/THC/THCTensorMath.cu
@@ -4,6 +4,7 @@
#include "THCApply.cuh"
#include "THCNumerics.cuh"
#include "THCTensorMath.cuh"
#include "THCThrustAllocator.cuh"

#include <thrust/copy.h>
#include <thrust/count.h>
76 changes: 76 additions & 0 deletions torch/lib/THC/THCTensorMath.cuh
@@ -23,4 +23,80 @@ __global__ void THCTensor_copyToDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t
}
}

#define CAT_ARRAY_BATCH_SIZE 1024
#define CAT_ARRAY_MAX_INPUT_DIMS 4

// Similar to any other IndexToOffset calculation for copying along a given dimension.
template <typename IndexType, int Dims>
struct CatArrIndexToOffset {
static inline __device__ IndexType compute(
const IndexType outputSize[Dims],
const IndexType outputStride[Dims],
const IndexType dimSize,
const unsigned int concatDim,
IndexType linearIndex) {
IndexType offset = 0;

#pragma unroll
for (int i = Dims - 1; i >= 1; --i) {
IndexType curDimSize = i == concatDim ? dimSize : outputSize[i];
IndexType nextDimIndex = linearIndex / curDimSize;
IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex;
IndexType curDimOffset = curDimIndex * outputStride[i];
offset += curDimOffset;
linearIndex = nextDimIndex;
}

return offset + linearIndex * outputStride[0];
}
};

template <typename T, typename IndexType>
struct CatArrInputTensor {
T* input;
IndexType offset;
IndexType dimSize;
IndexType nElements;
};

template<typename IndexType, unsigned int MaxDims>
struct OutputTensorSizeStride {
IndexType outputSize[MaxDims];
IndexType outputStride[MaxDims];
};

/**
* Kernel used to concatenate gridDim.y tensors into an output tensor. Uses a grid-stride loop based on
* blockIdx.x and threadIdx.x for each input to copy each element from each input tensor into the output.
*
* output: base pointer to the storage associated with the output tensor
* inputs: GPU-allocated array of input metadata for each input to concatenate in the kernel
* os: the size/stride vectors for the output tensor
* concatDim: dimension along which we are concatenating
* dimStride: the stride of the output tensor at the concatDim
*
* The most important assumption made is that the input tensors are contiguous.
*/
template <typename T, typename IndexType, int Dims>
__global__ void CatArrayBatchedCopy(
T* output,
CatArrInputTensor<T, IndexType>* inputs,
OutputTensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType dimStride) {
T* data = inputs[blockIdx.y].input;
IndexType offset = inputs[blockIdx.y].offset;
IndexType dimSize = inputs[blockIdx.y].dimSize;
IndexType nElements = inputs[blockIdx.y].nElements;
IndexType dataOffset = offset * dimStride;

for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < nElements;
linearIndex += gridDim.x * blockDim.x) {
IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
os.outputSize, os.outputStride, dimSize, concatDim, linearIndex);
output[dataOffset + elementOffset] = data[linearIndex];
}
}

#endif
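To make the index arithmetic concrete, here is a host-side rendition of the Dims == 3 case with a worked (hypothetical) example: concatenating along dim 1 of a [2, 5, 3] output with strides {15, 3, 1}, an input slice with dimSize 2 maps its linear index 7 (input coordinates [1, 0, 1]) to offset 1*15 + 0*3 + 1*1 = 16, to which the kernel adds dataOffset = offset * dimStride for the slice's starting slot along the concat dimension:

#include <assert.h>
#include <stddef.h>

/* Host-side copy of the CatArrIndexToOffset loop for Dims == 3. */
static size_t catOffset(const size_t size[3], const size_t stride[3],
                        size_t dimSize, int concatDim, size_t linearIndex) {
  size_t offset = 0;
  for (int i = 2; i >= 1; --i) {
    /* Along concatDim the input's extent (dimSize), not the output's,
       drives the decomposition of the linear index. */
    size_t curDimSize = (i == concatDim) ? dimSize : (size_t)size[i];
    size_t nextDimIndex = linearIndex / curDimSize;
    size_t curDimIndex = linearIndex - curDimSize * nextDimIndex;
    offset += curDimIndex * stride[i];
    linearIndex = nextDimIndex;
  }
  return offset + linearIndex * stride[0];
}

int main(void) {
  size_t size[3] = {2, 5, 3};
  size_t stride[3] = {15, 3, 1};
  assert(catOffset(size, stride, 2, 1, 7) == 16); /* element [1,0,1] */
  return 0;
}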
1 change: 1 addition & 0 deletions torch/lib/THC/THCTensorMathReduce.cuh
@@ -6,6 +6,7 @@
#include "THCNumerics.cuh"
#include "THCReduce.cuh"
#include "THCReduceAll.cuh"
#include "THCThrustAllocator.cuh"
#include <thrust/functional.h>
#include <thrust/device_ptr.h>
#include <thrust/transform_reduce.h>
1 change: 1 addition & 0 deletions torch/lib/THC/THCTensorSort.cuh
@@ -6,6 +6,7 @@
#include "THCTensorCopy.h"
#include "THCTensorTypeUtils.cuh"

#include "THCThrustAllocator.cuh"
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#if CUDA_VERSION >= 7000