Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,9 @@ if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:__cplusplus")

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:lambda")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:lambda")

set(CMAKE_NINJA_CMCLDEPS_RC OFF)
if(MSVC_Z7_OVERRIDE)
# CMake set debug flags to use /Z7
Expand Down
2 changes: 0 additions & 2 deletions aten/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ set(ATen_PUBLIC_HIP_DEPENDENCY_LIBS)
set(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory")
set(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory")
set(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory")
set(MEM_EFF_ATTENTION_CUDA_SOURCES)

set(TH_LINK_STYLE STATIC)
set(TH_CPU_INCLUDE
Expand Down Expand Up @@ -149,5 +148,4 @@ set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
set(ATen_ATTENTION_KERNEL_SRCS ${ATen_ATTENTION_KERNEL_SRCS} PARENT_SCOPE)
2 changes: 0 additions & 2 deletions aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,6 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND native_transformers_cuda_cu ${mem_eff_attention_cuda_cu})
list(APPEND native_transformers_cuda_cu ${mem_eff_attention_cuda_kernels_cu})
list(APPEND native_transformers_cuda_cpp ${mem_eff_attention_cuda_cpp})
list(APPEND MEM_EFF_ATTENTION_CUDA_SOURCES ${native_transformers_cuda_cu} ${mem_eff_attention_cuda_cu} ${mem_eff_attention_cuda_kernels_cu})
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()

Expand Down Expand Up @@ -914,5 +913,4 @@ set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
set(ATen_ATTENTION_KERNEL_SRCS ${ATen_ATTENTION_KERNEL_SRCS} PARENT_SCOPE)
16 changes: 8 additions & 8 deletions aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ static inline void cpu_cum_base_kernel(const Tensor& result,
.add_const_input(self)
.build();

auto result_dim_stride = ensure_nonempty_stride(result, dim);
auto self_dim_stride = ensure_nonempty_stride(self, dim);
const int64_t result_dim_stride = ensure_nonempty_stride(result, dim);
const int64_t self_dim_stride = ensure_nonempty_stride(self, dim);

auto loop = [&](char** data, const int64_t* strides, int64_t n) {
auto* result_data_bytes = data[0];
Expand All @@ -82,8 +82,8 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t

AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, self.scalar_type(), "cumsum_out_cpu", [&] {
cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
scalar_t* result_data, auto result_dim_stride,
const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
scalar_t* result_data, int64_t result_dim_stride,
const scalar_t* self_data, int64_t self_dim_stride, scalar_t init_val) {
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
auto cum_number = (at::acc_type<scalar_t, false>)init_val;
for (const auto i : c10::irange(self_dim_size)) {
Expand All @@ -101,8 +101,8 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t

AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, self.scalar_type(), "cumprod_out_cpu", [&] {
cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
scalar_t* result_data, auto result_dim_stride,
const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
scalar_t* result_data, int64_t result_dim_stride,
const scalar_t* self_data, int64_t self_dim_stride, scalar_t init_val) {
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
auto cum_number = (at::acc_type<scalar_t, false>)init_val;
for (const auto i : c10::irange(self_dim_size)) {
Expand All @@ -120,8 +120,8 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t

AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, self.scalar_type(), "logcumsumexp_out_cpu", [&] {
cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
scalar_t* result_data, auto result_dim_stride,
const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
scalar_t* result_data, int64_t result_dim_stride,
const scalar_t* self_data, int64_t self_dim_stride, scalar_t init_val) {
using accscalar_t = at::acc_type<scalar_t, false>;
auto cum_number = (accscalar_t)init_val;
for (const auto i : c10::irange(self_dim_size)) {
Expand Down
14 changes: 7 additions & 7 deletions aten/src/ATen/native/cpu/TensorCompareKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu
bool keepdim,
const func_t& f) {

auto self_dim_stride = ensure_nonempty_stride(self, dim);
const int64_t self_dim_stride = ensure_nonempty_stride(self, dim);

auto loop = [&](char** data, const int64_t* strides, int64_t n) {
auto* result1_data_bytes = data[0];
Expand All @@ -104,12 +104,12 @@ static void min_kernel_impl(
const Tensor& self,
int64_t dim,
bool keepdim) {
int64_t self_dim_size = ensure_nonempty_size(self, dim);
const int64_t self_dim_size = ensure_nonempty_size(self, dim);

AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
scalar_t* result_data, int64_t* indice_data,
const scalar_t* self_data, auto self_dim_stride) {
const scalar_t* self_data, int64_t self_dim_stride) {
using value_t = typename c10::scalar_value_type<scalar_t>::type;
value_t (*zabs_)(scalar_t) = zabs<scalar_t, value_t>;
scalar_t min_number = c10::load(self_data);
Expand Down Expand Up @@ -137,12 +137,12 @@ static void max_kernel_impl(
const Tensor& self,
int64_t dim,
bool keepdim) {
int64_t self_dim_size = ensure_nonempty_size(self, dim);
const int64_t self_dim_size = ensure_nonempty_size(self, dim);

AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
scalar_t* result_data, int64_t* indice_data,
const scalar_t* self_data, auto self_dim_stride) {
const scalar_t* self_data, int64_t self_dim_stride) {
using value_t = typename c10::scalar_value_type<scalar_t>::type;
value_t (*zabs_)(scalar_t) = zabs<scalar_t, value_t>;
scalar_t max_number = c10::load(self_data);
Expand Down Expand Up @@ -171,7 +171,7 @@ static void aminmax_kernel(
Tensor& min_result,
Tensor& max_result) {
auto wrap_dim = maybe_wrap_dim(dim, self.dim());
int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);
const int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);

TORCH_CHECK(min_result.scalar_type() == self.scalar_type() && max_result.scalar_type() == self.scalar_type(),
"Expect min and max dtype ", self.scalar_type(),
Expand All @@ -189,7 +189,7 @@ static void aminmax_kernel(
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
scalar_t* min_result_data, scalar_t* max_result_data,
const scalar_t* self_data, auto self_dim_stride) {
const scalar_t* self_data, int64_t self_dim_stride) {
scalar_t min_number = c10::load(self_data);
scalar_t max_number = min_number;
for (const auto i : c10::irange(self_dim_size)) {
Expand Down
14 changes: 0 additions & 14 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2005,18 +2005,4 @@ if(BUILD_TEST)
endforeach()
endif()
endif()

if(MSVC)
# This is used to enable the conforming lambda processor in MSVC
# Which allows us to capture constexpr in lambdas
# Note that this will be turned on by default for std=c++20 and above
# This should be applied globally when https://github.com/pytorch/pytorch/issues/92600 is fixed
foreach(tmp ${MEM_EFF_ATTENTION_CUDA_SOURCES})
# MEM_EFF_ATTENTION_CUDA is populated in pytorch/aten/src/ATen/CMakeLists.txt
# We iterate over these files, updating paths and adding the compile flag
FILE(RELATIVE_PATH tmp_path "${PROJECT_SOURCE_DIR}" "${tmp}")
SET(tmp_path "../${tmp_path}")
set_source_files_properties(${tmp_path} PROPERTIES COMPILE_FLAGS "-Xcompiler /Zc:lambda")
endforeach()
endif()
endif()
Loading