Update on "[quant][graphmode][fx] Add functional conv2d + relu"
Summary:
Added support for functional conv2d + relu; conv1d and conv3d will be added in a future PR.
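
A minimal sketch of the pattern this enables, assuming the FX graph mode quantization workflow as it exists at this point (prepare_fx/convert_fx driven by a qconfig dict); the module, shapes, and "fbgemm" backend below are illustrative only, not taken from this PR:

import torch
import torch.nn.functional as F
from torch.quantization import get_default_qconfig
from torch.quantization.quantize_fx import prepare_fx, convert_fx

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Weight/bias held as plain parameters so conv2d stays a functional call.
        self.weight = torch.nn.Parameter(torch.randn(8, 3, 3, 3))
        self.bias = torch.nn.Parameter(torch.zeros(8))

    def forward(self, x):
        # The pattern this change targets: functional conv2d followed by relu.
        return F.relu(F.conv2d(x, self.weight, self.bias))

m = M().eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}
prepared = prepare_fx(m, qconfig_dict)
prepared(torch.randn(1, 3, 32, 32))  # calibration pass
quantized = convert_fx(prepared)     # conv2d + relu should convert to a fused quantized conv-relu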

Test Plan:
python test/test_quantization.py TestQuantizeFxOps.test_functional_conv

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D26089964](https://our.internmc.facebook.com/intern/diff/D26089964)

[ghstack-poisoned]
jerryzh168 committed Jan 27, 2021
2 parents 9e342e6 + da91265, commit 1ee5d11
Showing 150 changed files with 5,467 additions and 1,452 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,7 +1,7 @@
 [submodule "third_party/pybind11"]
   ignore = dirty
   path = third_party/pybind11
-  url = https://github.com/seemethere/pybind11.git
+  url = https://github.com/pybind/pybind11.git
 [submodule "third_party/cub"]
   ignore = dirty
   path = third_party/cub
1 change: 1 addition & 0 deletions aten/src/ATen/CMakeLists.txt
@@ -325,6 +325,7 @@ if(USE_CUDA AND NOT USE_ROCM)
       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a
       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static_nocallback.a
       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a
+      ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a # needed for libcusolver_static
     )
   else()
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS
47 changes: 0 additions & 47 deletions aten/src/ATen/LegacyTHFunctionsCPU.cpp
@@ -776,53 +776,6 @@ std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self) {
     }
     return std::tuple<Tensor, Tensor>(res1, res2);
 }
-Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THDoubleTensor_orgqr(result_, self_, input2_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THFloatTensor_orgqr(result_, self_, input2_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_orgqr_out not supported on CPUType for ", dispatch_scalar_type);
-    }
-    return result;
-}
-Tensor _th_orgqr(const Tensor & self, const Tensor & input2) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-    auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto result = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(result_));
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type);
-            auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type);
-            THDoubleTensor_orgqr(result_, self_, input2_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type);
-            auto input2_ = checked_dense_tensor_unwrap(input2, "input2", 2, "_th_orgqr", false, DeviceType::CPU, dispatch_scalar_type);
-            THFloatTensor_orgqr(result_, self_, input2_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_orgqr not supported on CPUType for ", dispatch_scalar_type);
-    }
-    return result;
-}
 Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) {
     // DeviceGuard omitted
     auto dispatch_scalar_type = infer_scalar_type(self);
2 changes: 0 additions & 2 deletions aten/src/ATen/LegacyTHFunctionsCPU.h
@@ -42,8 +42,6 @@ Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
 Tensor _th_potri(const Tensor & self, bool upper);
 std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
 std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self);
-Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2);
-Tensor _th_orgqr(const Tensor & self, const Tensor & input2);
 Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose);
 Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose);
 
28 changes: 28 additions & 0 deletions aten/src/ATen/core/Vitals.cpp
@@ -0,0 +1,28 @@
+#include <ATen/core/Vitals.h>
+#include <cstdlib>
+
+TorchVitalAttr& TorchVital::create(const std::string& attr) {
+  if (!torchVitalEnabled()) {
+    static TorchVitalAttr disabled;
+    return disabled;
+  }
+  auto iter = attrs.find(attr);
+  if (iter == attrs.end()) {
+    auto r = attrs.emplace(std::make_pair(attr, TorchVitalAttr()));
+    return r.first->second;
+  }
+  return iter->second;
+}
+
+bool torchVitalEnabled() {
+  // If this is a performance hit, make `enabled` variable static
+  // and return `const bool&` instead
+  bool enabled = []() {
+    auto e = getenv("TORCH_VITAL");
+    if (e != nullptr) {
+      return strlen(e) > 0;
+    }
+    return false;
+  }();
+  return enabled;
+}
44 changes: 44 additions & 0 deletions aten/src/ATen/core/Vitals.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <unordered_map>
+
+bool torchVitalEnabled();
+
+struct TorchVitalAttr {
+  // always initialized to empty
+  std::string value = "";
+  template <typename T>
+  TorchVitalAttr& operator<<(const T& t) {
+    if (torchVitalEnabled()) {
+      std::stringstream ss;
+      ss << t;
+      value += ss.str();
+    }
+    return *this;
+  }
+};
+
+struct TorchVital {
+  std::string name;
+  std::unordered_map<std::string, TorchVitalAttr> attrs;
+
+  explicit TorchVital(std::string n) : name(std::move(n)) {}
+  TorchVital() = delete;
+
+  TorchVitalAttr& create(const std::string& attr);
+
+  ~TorchVital() {
+    for (const auto& m : attrs) {
+      std::cout << "[TORCH_VITAL] " << name << "." << m.first << "\t\t "
+                << m.second.value << "\n";
+    }
+  }
+};
+
+#define TORCH_VITAL_DECLARE(name) extern TorchVital TorchVital_##name;
+
+#define TORCH_VITAL_DEFINE(name) TorchVital TorchVital_##name(#name);
+
+#define TORCH_VITAL(name, attr) TorchVital_##name.create(#attr)
2 changes: 0 additions & 2 deletions aten/src/ATen/cpu/vec256/vec256_base.h
@@ -21,8 +21,6 @@
 #include <bitset>
 
 #include <ATen/cpu/vec256/intrinsics.h>
-#include <ATen/Utils.h>
-#include <ATen/native/Copy.h>
 #include <ATen/native/Math.h>
 #include <ATen/NumericUtils.h>
 #include <c10/util/C++17.h>
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec256/vec256_qint.h
@@ -5,7 +5,7 @@

 #include <ATen/cpu/vec256/intrinsics.h>
 #include <ATen/cpu/vec256/vec256_base.h>
-#include <ATen/native/quantized/affine_quantizer.h>
+#include <ATen/native/quantized/affine_quantizer_base.h>
 #include <c10/util/qint32.h>
 #include <c10/util/qint8.h>
 #include <c10/util/quint8.h>
54 changes: 21 additions & 33 deletions aten/src/ATen/cuda/CUDABlas.cpp
@@ -327,7 +327,6 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-  TORCH_CHECK(prop->major >= 8, "BFloat16 bgemm in CUDA requires Ampere or later GPU");
   TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedExFix(handle,
                                   opa, opb, (int)m, (int)n, (int)k,
                                   (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea,
@@ -343,7 +342,7 @@
                                   (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard,
                                   0, 0, NULL, NULL));
 #else
-  TORCH_CHECK(false, "BFloat16 bgemm in CUDA requires Ampere or later GPU");
+  TORCH_CHECK(false, "CUDA BFloat16 bgemm requires CUDA 11 or later");
 #endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000
 }
 #endif // __HIP_PLATFORM_HCC__
@@ -550,37 +549,26 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
   float fbeta = beta;
   _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc);
   GEMM_CHECK_ARGVALUES(at::BFloat16);
-  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-  if (prop->major >= 8) {
-    // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH
-    // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required.
-    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-    TORCH_CUDABLAS_CHECK(cublasGemmEx(
-        handle,
-        opa,
-        opb,
-        m,
-        n,
-        k,
-        &falpha,
-        a,
-        CUDA_R_16BF,
-        lda,
-        b,
-        CUDA_R_16BF,
-        ldb,
-        &fbeta,
-        c,
-        CUDA_R_16BF,
-        ldc,
-        CUDA_R_32F,
-        CUBLAS_GEMM_DFALT_TENSOR_OP));
-    // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH
-    // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required.
-    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
-  } else {
-    TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU");
-  }
+  TORCH_CUDABLAS_CHECK(cublasGemmEx(
+      handle,
+      opa,
+      opb,
+      m,
+      n,
+      k,
+      &falpha,
+      a,
+      CUDA_R_16BF,
+      lda,
+      b,
+      CUDA_R_16BF,
+      ldb,
+      &fbeta,
+      c,
+      CUDA_R_16BF,
+      ldc,
+      CUDA_R_32F,
+      CUBLAS_GEMM_DFALT_TENSOR_OP));
 }
 #endif
 