
Commit

Update
[ghstack-poisoned]
xmfan committed May 16, 2024
2 parents 2fca582 + 9056fd9 commit 0bc1814
Showing 86 changed files with 1,846 additions and 468 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/common/install_acl.sh
@@ -1,6 +1,6 @@
set -euo pipefail

-readonly version=v23.08
+readonly version=v24.04
readonly src_host=https://review.mlplatform.org/ml
readonly src_repo=ComputeLibrary

Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/lint.yml
@@ -43,8 +43,6 @@ jobs:
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
-pip install onnx==1.16.0
-pip install numpy==1.26.4
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT"
.github/scripts/lintrunner.sh
2 changes: 2 additions & 0 deletions BUILD.bazel
@@ -663,6 +663,7 @@ cu_library(
name = "torch_cuda",
srcs = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
copts = torch_cuda_half_options,
@@ -830,6 +831,7 @@ cc_library(
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
)) + torch_sources,
59 changes: 59 additions & 0 deletions aten/src/ATen/native/FusedAdagrad.cpp
@@ -0,0 +1,59 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/FusedAdagrad.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_fused_adagrad.h>
#include <ATen/ops/_fused_adagrad_native.h>
#endif
namespace at {

namespace native {

void _fused_adagrad_kernel_cpu_(
at::TensorList params,
at::TensorList grads,
at::TensorList state_sums,
at::TensorList state_steps,
const double lr,
const double lr_decay,
const double weight_decay,
const double eps,
const bool maximize,
const c10::optional<at::Tensor>& grad_scale,
const c10::optional<at::Tensor>& found_inf) {
const float* grad_scale_ptr =
grad_scale.has_value() ? grad_scale->data_ptr<float>() : nullptr;
const float* found_inf_ptr =
found_inf.has_value() ? found_inf->data_ptr<float>() : nullptr;
if (found_inf_ptr && *found_inf_ptr == 1.0) {
return;
}
size_t n_tensors = params.size();
TORCH_CHECK(grads.size() == n_tensors);
TORCH_CHECK(state_sums.size() == n_tensors);
TORCH_CHECK(state_steps.size() == n_tensors);
for (size_t i = 0; i < n_tensors; i++){
fused_adagrad_stub(
kCPU,
params[i],
grads[i],
state_sums[i],
state_steps[i],
lr,
lr_decay,
weight_decay,
eps,
maximize,
grad_scale_ptr);
}
}

DEFINE_DISPATCH(fused_adagrad_stub);

} // namespace native
} // namespace at
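For reference, the per-element update this new fused Adagrad path performs, as can be read from the scalar fallback loop in aten/src/ATen/native/cpu/FusedAdagradKernel.cpp later in this diff (p is the parameter, g the gradient, step the value held in state_steps; grad scaling and maximize are applied to g first when enabled):

\[
\begin{aligned}
\mathrm{clr} &= \frac{\mathrm{lr}}{1 + (\mathrm{step} - 1)\cdot \mathrm{lr\_decay}}\\
g &\leftarrow g / \mathrm{grad\_scale} \quad \text{(if a grad scale is passed)}\\
g &\leftarrow -g \quad \text{(if maximize)}\\
g &\leftarrow g + \mathrm{weight\_decay}\cdot p \quad \text{(if } \mathrm{weight\_decay} \neq 0\text{)}\\
\mathrm{state\_sum} &\leftarrow \mathrm{state\_sum} + g^{2}\\
p &\leftarrow p - \frac{\mathrm{clr}\cdot g}{\sqrt{\mathrm{state\_sum}} + \mathrm{eps}}
\end{aligned}
\]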
23 changes: 23 additions & 0 deletions aten/src/ATen/native/FusedAdagrad.h
@@ -0,0 +1,23 @@
#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>

namespace at {

namespace native {

using fused_adagrad_fn = void (*)(
const at::Tensor& param,
const at::Tensor& grad,
const at::Tensor& state_sum,
const at::Tensor& state_step,
const double lr,
const double lr_decay,
const double weight_decay,
const double eps,
const bool maximize,
const float* grad_scale_ptr);

DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub);

} // namespace native
} // namespace at
8 changes: 8 additions & 0 deletions aten/src/ATen/native/ReduceAllOps.cpp
@@ -8,6 +8,7 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
+#include <ATen/ops/_aminmax_native.h>
#include <ATen/ops/aminmax.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/max.h>
@@ -65,4 +66,11 @@ Tensor& max_unary_out(const Tensor &self, Tensor& out) {
return out;
}

+// DEPRECATED: Use at::aminmax instead
+std::tuple<Tensor, Tensor> _aminmax_all(const Tensor &self) {
+TORCH_WARN_ONCE("_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead."
+" This warning will only appear once per process.");
+return at::aminmax(self);
+}

} // namespace at::native
8 changes: 8 additions & 0 deletions aten/src/ATen/native/TensorCompare.cpp
@@ -20,6 +20,7 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
+#include <ATen/ops/_aminmax_native.h>
#include <ATen/ops/_assert_async_native.h>
#include <ATen/ops/_functional_assert_async_native.h>
#include <ATen/ops/_print_native.h>
@@ -681,6 +682,13 @@ std::tuple<Tensor, Tensor> qmin(const Tensor& self, int64_t dim, bool keepdim) {
at::_make_per_tensor_quantized_tensor(min, self.q_scale(), self.q_zero_point()), min_indices);
}

+// DEPRECATED: Use at::aminmax instead
+std::tuple<Tensor, Tensor> _aminmax(const Tensor& self, int64_t dim, bool keepdim) {
+TORCH_WARN_ONCE("_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead."
+" This warning will only appear once per process.");
+return at::aminmax(self, dim, keepdim);
+}

TORCH_IMPL_FUNC(clamp_out)
(
const Tensor& /*self*/,
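Both deprecated overloads simply forward to at::aminmax, so call sites can be migrated mechanically. Below is a minimal sketch of that migration using only the public ATen C++ API; the wrapper function and the tensor t are illustrative and not part of this commit:

#include <ATen/ATen.h>

// Illustrative helper; the name and the tensor argument are not from this diff.
void aminmax_migration_example(const at::Tensor& t) {
  // Deprecated overloads kept by this commit (now emit TORCH_WARN_ONCE):
  //   auto old_all = at::_aminmax(t);
  //   auto old_dim = at::_aminmax(t, /*dim=*/0, /*keepdim=*/false);

  // Replacement recommended by the warning text:
  auto [min_all, max_all] = at::aminmax(t);                            // whole-tensor reduction
  auto [min_d, max_d] = at::aminmax(t, /*dim=*/0, /*keepdim=*/false);  // reduction along dim 0
  (void)min_all; (void)max_all; (void)min_d; (void)max_d;              // silence unused warnings
}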
4 changes: 2 additions & 2 deletions aten/src/ATen/native/TypeProperties.cpp
@@ -191,8 +191,8 @@ ScalarType result_type(const Scalar& scalar1, const Scalar& scalar2) {
return result_type(state);
}

-bool can_cast(const at::ScalarType from_, const at::ScalarType to) {
-return at::canCast(from_, to);
+bool can_cast(const at::ScalarType from, const at::ScalarType to) {
+return at::canCast(from, to);
}

ScalarType promote_types(ScalarType type1, ScalarType type2) {
225 changes: 225 additions & 0 deletions aten/src/ATen/native/cpu/FusedAdagradKernel.cpp
@@ -0,0 +1,225 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Parallel.h>
#include <ATen/OpMathType.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/FusedAdagrad.h>
#include <ATen/Dispatch.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/cpu/vec/functional.h>
namespace at::native {

namespace{

template <typename scalar_t, typename opmath_t>
typename std::enable_if<
std::is_same<scalar_t, Half>::value || std::is_same<scalar_t, BFloat16>::value,
void>::
type inline adagrad_math(
scalar_t* param_ptr,
scalar_t* grad_ptr,
scalar_t* state_sum_ptr,
const double clr,
const double eps,
const double weight_decay,
const bool maximize,
const float* grad_scale_ptr,
int64_t size
){
using lpVec = at::vec::Vectorized<scalar_t>;
using fVec = at::vec::Vectorized<opmath_t>;
lpVec grad_vec_to_store;
fVec param_vec1, param_vec2;
fVec grad_vec1, grad_vec2;
fVec state_sum_vec1, state_sum_vec2;
int64_t d = 0;
for (; d < size - (size % lpVec::size()); d += lpVec::size()) {
lpVec param_lpvec = lpVec::loadu(param_ptr + d);
std::tie(param_vec1, param_vec2) = vec::convert_to_float<scalar_t>(param_lpvec);
lpVec grad_lpvec = lpVec::loadu(grad_ptr + d);
std::tie(grad_vec1, grad_vec2) = vec::convert_to_float<scalar_t>(grad_lpvec);
if (grad_scale_ptr) {
grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr));
grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr));
grad_vec_to_store = vec::convert_from_float<scalar_t>(grad_vec1, grad_vec2);
grad_vec_to_store.store(grad_ptr + d);
}
if (maximize){
grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0));
grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0));
}
if (weight_decay != 0.0){
grad_vec1 += param_vec1 * fVec(scalar_t(weight_decay));
grad_vec2 += param_vec2 * fVec(scalar_t(weight_decay));
}
std::tie(state_sum_vec1, state_sum_vec2) = vec::convert_to_float<scalar_t>(lpVec::loadu(state_sum_ptr + d));
state_sum_vec1 += grad_vec1 * grad_vec1;
state_sum_vec2 += grad_vec2 * grad_vec2;
vec::convert_from_float<scalar_t>(state_sum_vec1, state_sum_vec2).store(state_sum_ptr + d);

fVec std_vec1 = state_sum_vec1.sqrt() + fVec(scalar_t(eps));
fVec std_vec2 = state_sum_vec2.sqrt() + fVec(scalar_t(eps));
param_vec1 = param_vec1 - fVec(scalar_t(clr)) * grad_vec1 / std_vec1;
param_vec2 = param_vec2 - fVec(scalar_t(clr)) * grad_vec2 / std_vec2;
vec::convert_from_float<scalar_t>(param_vec1, param_vec2).store(param_ptr + d);
}
scalar_t grad_val_to_store;
for (; d < size; d++) {
opmath_t grad_val = grad_ptr[d];
opmath_t param_val = param_ptr[d];
if (grad_scale_ptr) {
grad_val = grad_ptr[d] / opmath_t(*grad_scale_ptr);
grad_val_to_store = grad_val;
grad_ptr[d] = grad_val_to_store;
}
if (maximize) grad_val = -grad_val;
if (weight_decay != 0.0){
grad_val += param_val * opmath_t(weight_decay);
}
opmath_t state_sum_val = state_sum_ptr[d];
state_sum_val += grad_val * grad_val;
state_sum_ptr[d] = state_sum_val;
opmath_t std_val = std::sqrt(state_sum_val) + opmath_t(eps);
param_val -= opmath_t(clr) * grad_val / std_val;
param_ptr[d] = param_val;
}
}


template <typename scalar_t, typename opmath_t>
typename std::enable_if<
std::is_same<scalar_t, float>::value || std::is_same<scalar_t, double>::value,
void>::
type inline adagrad_math(
scalar_t* param_ptr,
scalar_t* grad_ptr,
scalar_t* state_sum_ptr,
const double clr,
const double eps,
const double weight_decay,
const bool maximize,
const float* grad_scale_ptr,
int64_t size
){
using Vec = at::vec::Vectorized<scalar_t>;
Vec grad_vec_to_store;
int64_t d = 0;
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec param_vec = Vec::loadu(param_ptr + d);
Vec grad_vec = Vec::loadu(grad_ptr + d);
if (grad_scale_ptr) {
grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr));
grad_vec_to_store = grad_vec;
grad_vec_to_store.store(grad_ptr + d);
}
if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0));
if (weight_decay != 0.0){
grad_vec += param_vec * Vec(scalar_t(weight_decay));
}

Vec sum_vec = Vec::loadu(state_sum_ptr + d) + grad_vec * grad_vec;
sum_vec.store(state_sum_ptr + d);

Vec std_vec = sum_vec.sqrt() + Vec(scalar_t(eps));
param_vec = param_vec - Vec(scalar_t(clr)) * grad_vec / std_vec;
param_vec.store(param_ptr + d);
}
scalar_t grad_val_to_store;
for (; d < size; d++) {
scalar_t grad_val = grad_ptr[d];
if (grad_scale_ptr) {
grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr);
grad_val_to_store = grad_val;
grad_ptr[d] = grad_val_to_store;
}
if (maximize) grad_val = -grad_val;
if (weight_decay != 0.0){
grad_val += param_ptr[d] * scalar_t(weight_decay);
}
state_sum_ptr[d] += grad_val * grad_val;

scalar_t std_val = std::sqrt(state_sum_ptr[d]) + scalar_t(eps);
param_ptr[d] -= scalar_t(clr) * grad_val / std_val;
}
}

template <typename scalar_t>
void adagrad_fused_step_impl(
const at::Tensor& param,
const at::Tensor& grad,
const at::Tensor& state_sum,
const at::Tensor& state_step,
const double lr,
const double lr_decay,
const double weight_decay,
const double eps,
const bool maximize,
const float* grad_scale_ptr) {
using opmath_t = at::opmath_type<scalar_t>;
scalar_t* param_data = param.data_ptr<scalar_t>();
scalar_t* grad_data = grad.data_ptr<scalar_t>();
scalar_t* state_sum_data = state_sum.data_ptr<scalar_t>();
double step = state_step.item<float>();
double clr = lr / (1.0 + (step - 1.0) * lr_decay);

constexpr size_t cache_line_size = 64;
constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t);
size_t num_units = divup(param.numel(), cache_line_aligned_task_unit);

auto adagrad_fn = [&](int64_t begin, int64_t end) {
// local pointers
begin *= cache_line_aligned_task_unit;
end = std::min(end * cache_line_aligned_task_unit, param.numel());
scalar_t* param_ptr = param_data + begin;
scalar_t* grad_ptr = grad_data + begin;
scalar_t* state_sum_ptr = state_sum_data + begin;

const int64_t size = end - begin;
adagrad_math<scalar_t, opmath_t>(
param_ptr,
grad_ptr,
state_sum_ptr,
clr,
eps,
weight_decay,
maximize,
grad_scale_ptr,
size
);
};
at::parallel_for(
0, num_units, 0, adagrad_fn);
}

void fused_adagrad_kernel(
const at::Tensor& param,
const at::Tensor& grad,
const at::Tensor& state_sum,
const at::Tensor& state_step,
const double lr,
const double lr_decay,
const double weight_decay,
const double eps,
const bool maximize,
const float* grad_scale_ptr
) {
Tensor grad_contiguous = grad.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_adagrad_kernel", [&] {
adagrad_fused_step_impl<scalar_t>(
param,
grad,
state_sum,
state_step,
lr,
lr_decay,
weight_decay,
eps,
maximize,
grad_scale_ptr);
});
}

} // anonymous namespace

REGISTER_DISPATCH(fused_adagrad_stub, &fused_adagrad_kernel);
} // namespace at::native
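The new CPU kernel splits the flat parameter buffer into 64-byte, cache-line-aligned task units and lets at::parallel_for hand each callback a range of those units, which the lambda then converts back into element offsets. The standalone sketch below reproduces just that index arithmetic in plain C++ (divup is re-implemented locally and the concrete sizes are made-up examples, not values from the diff):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Local stand-in for the ceiling-division helper used by the kernel.
static int64_t divup(int64_t x, int64_t y) { return (x + y - 1) / y; }

int main() {
  using scalar_t = float;
  const int64_t numel = 1000;                       // e.g. param.numel()
  constexpr int64_t cache_line_size = 64;           // bytes
  constexpr int64_t task_unit = cache_line_size / sizeof(scalar_t);  // 16 floats per unit
  const int64_t num_units = divup(numel, task_unit);                 // 63 units

  // What one parallel_for callback does with its [begin, end) unit range
  // (here: the last chunk, which is shorter than a full unit).
  int64_t begin = num_units - 1;                    // 62
  int64_t end = num_units;                          // 63
  begin *= task_unit;                               // 992
  end = std::min(end * task_unit, numel);           // min(1008, 1000) = 1000
  std::printf("processes elements [%lld, %lld)\n",
              static_cast<long long>(begin), static_cast<long long>(end));
  return 0;
}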