pytorch · xwang233 · Oct 6, 2020 · Oct 6, 2020
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
@@ -193,11 +193,14 @@ namespace at { namespace cuda { using namespace c10::hip; }}
 #define C10_DEVICE __device__
 #define C10_HOST __host__
 // constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
-// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
-// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
+// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5),
+// 1536 for GeForce Ampere (8.6),
+// and 2048 for all other architectures. You'll get warnings if you exceed these constants.
 // Hence, the following macros adjust the input values from the user to resolve potential warnings.
 #if __CUDA_ARCH__ == 750
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
+#elif __CUDA_ARCH__ == 860  // GeForce Ampere (compute capability 8.6)
+constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
 #else
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
 #endif