diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad9342580cdf6..c76a942089188 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -571,6 +571,9 @@ if(MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:__cplusplus")
 
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:lambda")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:lambda")
+
   set(CMAKE_NINJA_CMCLDEPS_RC OFF)
   if(MSVC_Z7_OVERRIDE)
     # CMake set debug flags to use /Z7
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index d787d0850ab67..11d3ea4acb8e7 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -50,7 +50,6 @@ set(ATen_PUBLIC_HIP_DEPENDENCY_LIBS)
 set(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory")
 set(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory")
 set(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory")
-set(MEM_EFF_ATTENTION_CUDA_SOURCES)
 
 set(TH_LINK_STYLE STATIC)
 set(TH_CPU_INCLUDE
@@ -149,5 +148,4 @@ set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
 set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
-set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
 set(ATen_ATTENTION_KERNEL_SRCS ${ATen_ATTENTION_KERNEL_SRCS} PARENT_SCOPE)
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 0d90f8a34e404..b5e3314255f92 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -248,7 +248,6 @@ if(USE_MEM_EFF_ATTENTION)
   list(APPEND native_transformers_cuda_cu ${mem_eff_attention_cuda_cu})
   list(APPEND native_transformers_cuda_cu ${mem_eff_attention_cuda_kernels_cu})
   list(APPEND native_transformers_cuda_cpp ${mem_eff_attention_cuda_cpp})
-  list(APPEND MEM_EFF_ATTENTION_CUDA_SOURCES ${native_transformers_cuda_cu} ${mem_eff_attention_cuda_cu} ${mem_eff_attention_cuda_kernels_cu})
   list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
 endif()
 
@@ -914,5 +913,4 @@ set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
-set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
 set(ATen_ATTENTION_KERNEL_SRCS ${ATen_ATTENTION_KERNEL_SRCS} PARENT_SCOPE)
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
index c06731dfc718c..d5d1f7b692b2d 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -54,8 +54,8 @@ static inline void cpu_cum_base_kernel(const Tensor& result,
     .add_const_input(self)
     .build();
 
-  auto result_dim_stride = ensure_nonempty_stride(result, dim);
-  auto self_dim_stride = ensure_nonempty_stride(self, dim);
+  const int64_t result_dim_stride = ensure_nonempty_stride(result, dim);
+  const int64_t self_dim_stride = ensure_nonempty_stride(self, dim);
 
   auto loop = [&](char** data, const int64_t* strides, int64_t n) {
     auto* result_data_bytes = data[0];
@@ -82,8 +82,8 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf,
       self.scalar_type(), "cumsum_out_cpu", [&] {
         cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
-          scalar_t* result_data, auto result_dim_stride,
-          const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
+          scalar_t* result_data, int64_t result_dim_stride,
+          const scalar_t* self_data, int64_t self_dim_stride, scalar_t init_val) {
            // NOLINTNEXTLINE(bugprone-signed-char-misuse)
            auto cum_number = (at::acc_type<scalar_t, false>)init_val;
            for (const auto i : c10::irange(self_dim_size)) {
@@ -101,8 +101,8 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf,
       self.scalar_type(), "cumprod_out_cpu", [&] {
         cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
-          scalar_t* result_data, auto result_dim_stride,
-          const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
+          scalar_t* result_data, int64_t result_dim_stride,
+          const scalar_t* self_data, int64_t self_dim_stride, scalar_t init_val) {
            // NOLINTNEXTLINE(bugprone-signed-char-misuse)
            auto cum_number = (at::acc_type<scalar_t, false>)init_val;
            for (const auto i : c10::irange(self_dim_size)) {
@@ -120,8 +120,8 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
       self.scalar_type(), "logcumsumexp_out_cpu", [&] {
         cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
-          scalar_t* result_data, auto result_dim_stride,
-          const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
+          scalar_t* result_data, int64_t result_dim_stride,
+          const scalar_t* self_data, int64_t self_dim_stride, scalar_t init_val) {
            using accscalar_t = at::acc_type<scalar_t, false>;
            auto cum_number = (accscalar_t)init_val;
            for (const auto i : c10::irange(self_dim_size)) {
diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
index 2c52a61fc553a..eb8e8a7c85505 100644
--- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
+++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp
@@ -77,7 +77,7 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu
     bool keepdim,
     const func_t& f) {
 
-  auto self_dim_stride = ensure_nonempty_stride(self, dim);
+  const int64_t self_dim_stride = ensure_nonempty_stride(self, dim);
 
   auto loop = [&](char** data, const int64_t* strides, int64_t n) {
     auto* result1_data_bytes = data[0];
@@ -104,12 +104,12 @@ static void min_kernel_impl(
     const Tensor& self,
     int64_t dim,
     bool keepdim) {
-  int64_t self_dim_size = ensure_nonempty_size(self, dim);
+  const int64_t self_dim_size = ensure_nonempty_size(self, dim);
 
   AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
     compare_base_kernel<scalar_t, int64_t>(result, indice, self, dim, keepdim, [&] (
       scalar_t* result_data, int64_t* indice_data,
-      const scalar_t* self_data, auto self_dim_stride) {
+      const scalar_t* self_data, int64_t self_dim_stride) {
        using value_t = typename c10::scalar_value_type<scalar_t>::type;
        value_t (*zabs_)(scalar_t) = zabs<scalar_t, value_t>;
        scalar_t min_number = c10::load(self_data);
@@ -137,12 +137,12 @@ static void max_kernel_impl(
     const Tensor& self,
     int64_t dim,
     bool keepdim) {
-  int64_t self_dim_size = ensure_nonempty_size(self, dim);
+  const int64_t self_dim_size = ensure_nonempty_size(self, dim);
 
   AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
     compare_base_kernel<scalar_t, int64_t>(result, indice, self, dim, keepdim, [&] (
       scalar_t* result_data, int64_t* indice_data,
-      const scalar_t* self_data, auto self_dim_stride) {
+      const scalar_t* self_data, int64_t self_dim_stride) {
        using value_t = typename c10::scalar_value_type<scalar_t>::type;
        value_t (*zabs_)(scalar_t) = zabs<scalar_t, value_t>;
        scalar_t max_number = c10::load(self_data);
@@ -171,7 +171,7 @@ static void aminmax_kernel(
     Tensor& min_result,
     Tensor& max_result) {
   auto wrap_dim = maybe_wrap_dim(dim, self.dim());
-  int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);
+  const int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);
 
   TORCH_CHECK(min_result.scalar_type() == self.scalar_type() && max_result.scalar_type() == self.scalar_type(),
     "Expect min and max dtype ", self.scalar_type(),
@@ -189,7 +189,7 @@ static void aminmax_kernel(
   AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
     compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
       scalar_t* min_result_data, scalar_t* max_result_data,
-      const scalar_t* self_data, auto self_dim_stride) {
+      const scalar_t* self_data, int64_t self_dim_stride) {
        scalar_t min_number = c10::load(self_data);
        scalar_t max_number = min_number;
        for (const auto i : c10::irange(self_dim_size)) {
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 10191edee9cd6..0689b74995007 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -2005,18 +2005,4 @@ if(BUILD_TEST)
     endforeach()
   endif()
 endif()
-
-if(MSVC)
-  # This is used to enable the conforming lambda processor in MSVC
-  # Which allows us to capture constexpr in lambdas
-  # Note that this will be turned on by default for std=c++20 and above
-  # This should be applied globally when https://github.com/pytorch/pytorch/issues/92600 is fixed
-  foreach(tmp ${MEM_EFF_ATTENTION_CUDA_SOURCES})
-    # MEM_EFF_ATTENTION_CUDA is populated in pytorch/aten/src/ATen/CMakeLists.txt
-    # We iterate over these files, updating paths and adding the compile flag
-    FILE(RELATIVE_PATH tmp_path "${PROJECT_SOURCE_DIR}" "${tmp}")
-    SET(tmp_path "../${tmp_path}")
-    set_source_files_properties(${tmp_path} PROPERTIES COMPILE_FLAGS "-Xcompiler /Zc:lambda")
-  endforeach()
-endif()
 endif()
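
Note (not part of the diff): the removed caffe2/CMakeLists.txt block explains why /Zc:lambda matters: it enables MSVC's conforming lambda processor, which is needed to reference constexpr locals from inside lambdas, and it is turned on by default for std=c++20 and above. A minimal sketch of the kind of pattern involved follows; the names kStride and scale are illustrative, not from PyTorch. The legacy processor can erroneously demand a capture for the constexpr used inside a generic lambda, while a /Zc:lambda build compiles it as standard C++.

    #include <cstdio>

    int main() {
        constexpr int kStride = 4;
        // A constexpr local that is only read (not odr-used) does not need to
        // be captured. MSVC's legacy lambda processor may still reject this
        // use inside a generic lambda; the conforming processor enabled by
        // /Zc:lambda accepts it.
        auto scale = [](auto i) { return i * kStride; };
        std::printf("%d\n", scale(3)); // prints 12
        return 0;
    }

With the flag now applied globally in the top-level CMakeLists.txt (/Zc:lambda for cl, -Xcompiler /Zc:lambda for nvcc), the per-file workaround over MEM_EFF_ATTENTION_CUDA_SOURCES is obsolete, which is why that variable and the caffe2 foreach loop are deleted. The kernel changes are the same cleanup from the other side: the callback parameters go from generic auto to concrete int64_t strides, so those lambdas are no longer generic.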