
Commit

Update
[ghstack-poisoned]
XuehaiPan committed Jun 18, 2024
2 parents cd9894a + 32e5ded commit 4573c4e
Showing 113 changed files with 1,467 additions and 1,420 deletions.
2 changes: 1 addition & 1 deletion .ci/pytorch/common_utils.sh
@@ -191,7 +191,7 @@ function clone_pytorch_xla() {
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
-git clone https://github.com/eellison/benchmark torchbench
+git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"

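Taken together, the four .ci script edits in this commit repoint every torchbench clone from the eellison fork back to upstream pytorch/benchmark, while the pin file further down moves the checked-out revision to a fixed commit. A minimal sketch of the pinned-clone pattern these scripts share, assuming get_pinned_commit simply reads .github/ci_commit_pins/<name>.txt (the file updated later in this diff):

    #!/usr/bin/env bash
    # Sketch: clone the benchmark repo and check out the pinned revision.
    set -euo pipefail

    get_pinned_commit() {
      cat .github/ci_commit_pins/"$1".txt
    }

    commit=$(get_pinned_commit torchbench)
    git clone https://github.com/pytorch/benchmark torchbench
    pushd torchbench
    git checkout "$commit"
    popd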
2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
@@ -9,7 +9,7 @@ test_cpu_speed_mini_sequence_labeler () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
@@ -9,7 +9,7 @@ test_gpu_speed_cudnn_lstm () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_gpu_speed_lstm.sh
@@ -9,7 +9,7 @@ test_gpu_speed_lstm () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
@@ -9,7 +9,7 @@ test_gpu_speed_mlstm () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
@@ -1 +1 @@
-pin_yolo_dep
+0dab1dd97709096e8129f8a08115ee83f64f2194
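The pin itself moves from a branch name on the fork (pin_yolo_dep) to an immutable SHA in the upstream repository. One way to confirm such a pin resolves upstream, sketched here on the assumption that the server allows fetching a commit by raw SHA (GitHub does):

    git init -q /tmp/pin-check && cd /tmp/pin-check
    git fetch -q --depth 1 https://github.com/pytorch/benchmark \
      0dab1dd97709096e8129f8a08115ee83f64f2194
    git cat-file -t FETCH_HEAD  # prints "commit" when the pin exists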
53 changes: 53 additions & 0 deletions .lintrunner.toml
@@ -1534,6 +1534,59 @@ exclude_patterns = [
'torch/linalg/__init__.py',
'torch/monitor/__init__.py',
'torch/nested/__init__.py',
+'torch/nn/intrinsic/__init__.py',
+'torch/nn/intrinsic/modules/__init__.py',
+'torch/nn/intrinsic/modules/fused.py',
+'torch/nn/intrinsic/qat/__init__.py',
+'torch/nn/intrinsic/qat/modules/__init__.py',
+'torch/nn/intrinsic/qat/modules/conv_fused.py',
+'torch/nn/intrinsic/qat/modules/linear_fused.py',
+'torch/nn/intrinsic/qat/modules/linear_relu.py',
+'torch/nn/intrinsic/quantized/__init__.py',
+'torch/nn/intrinsic/quantized/dynamic/__init__.py',
+'torch/nn/intrinsic/quantized/dynamic/modules/__init__.py',
+'torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py',
+'torch/nn/intrinsic/quantized/modules/__init__.py',
+'torch/nn/intrinsic/quantized/modules/bn_relu.py',
+'torch/nn/intrinsic/quantized/modules/conv_relu.py',
+'torch/nn/intrinsic/quantized/modules/linear_relu.py',
+'torch/nn/qat/__init__.py',
+'torch/nn/qat/dynamic/__init__.py',
+'torch/nn/qat/dynamic/modules/__init__.py',
+'torch/nn/qat/dynamic/modules/linear.py',
+'torch/nn/qat/modules/__init__.py',
+'torch/nn/qat/modules/conv.py',
+'torch/nn/qat/modules/embedding_ops.py',
+'torch/nn/qat/modules/linear.py',
+'torch/nn/quantizable/__init__.py',
+'torch/nn/quantizable/modules/__init__.py',
+'torch/nn/quantizable/modules/activation.py',
+'torch/nn/quantizable/modules/rnn.py',
+'torch/nn/quantized/__init__.py',
+'torch/nn/quantized/_reference/__init__.py',
+'torch/nn/quantized/_reference/modules/__init__.py',
+'torch/nn/quantized/_reference/modules/conv.py',
+'torch/nn/quantized/_reference/modules/linear.py',
+'torch/nn/quantized/_reference/modules/rnn.py',
+'torch/nn/quantized/_reference/modules/sparse.py',
+'torch/nn/quantized/_reference/modules/utils.py',
+'torch/nn/quantized/dynamic/__init__.py',
+'torch/nn/quantized/dynamic/modules/__init__.py',
+'torch/nn/quantized/dynamic/modules/conv.py',
+'torch/nn/quantized/dynamic/modules/linear.py',
+'torch/nn/quantized/dynamic/modules/rnn.py',
+'torch/nn/quantized/functional.py',
+'torch/nn/quantized/modules/__init__.py',
+'torch/nn/quantized/modules/activation.py',
+'torch/nn/quantized/modules/batchnorm.py',
+'torch/nn/quantized/modules/conv.py',
+'torch/nn/quantized/modules/dropout.py',
+'torch/nn/quantized/modules/embedding_ops.py',
+'torch/nn/quantized/modules/functional_modules.py',
+'torch/nn/quantized/modules/linear.py',
+'torch/nn/quantized/modules/normalization.py',
+'torch/nn/quantized/modules/rnn.py',
+'torch/nn/quantized/modules/utils.py',
'torch/signal/__init__.py',
'torch/signal/windows/__init__.py',
'torch/signal/windows/windows.py',
5 changes: 5 additions & 0 deletions Dockerfile
@@ -77,6 +77,11 @@ RUN case ${TARGETPLATFORM} in \
esac && \
/opt/conda/bin/conda clean -ya
RUN /opt/conda/bin/pip install torchelastic
+RUN IS_CUDA=$(python -c 'import torch ; print(torch.cuda._is_compiled())'); \
+echo "Is torch compiled with cuda: ${IS_CUDA}"; \
+if test "${IS_CUDA}" != "True" -a ! -z "${CUDA_VERSION}"; then \
+exit 1; \
+fi

FROM ${BASE_IMAGE} as official
ARG PYTORCH_VERSION
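The new RUN step makes a mismatched image fail at build time: if CUDA_VERSION is set but the installed torch was built without CUDA support, the build aborts. It checks torch.cuda._is_compiled() (a private API, so subject to change) rather than torch.cuda.is_available(), since build machines usually have no GPU attached and availability would be False even for a correct CUDA build. The distinction, as a standalone sketch:

    # Build-time CUDA support vs. runtime availability:
    python -c 'import torch; print(torch.cuda._is_compiled())'  # built with CUDA?
    python -c 'import torch; print(torch.cuda.is_available())'  # and a usable GPU/driver too?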
1 change: 1 addition & 0 deletions aten/src/ATen/cpu/vec/vec_mask.h
@@ -259,6 +259,7 @@ VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b)
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b))
+VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator!=, (a ^ b))

#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL
#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL
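The added operator!= mirrors the existing operator==, which is defined as ~(a ^ b): for lane masks whose lanes are all-ones or all-zeros, inequality is plain XOR. A standalone sketch of that identity on ordinary integers:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Per-lane mask values are all-ones (0xFF) or all-zeros (0x00).
      uint8_t a = 0xFF, b = 0x00;
      assert(static_cast<uint8_t>(a ^ b) == 0xFF);     // a != b -> true (all-ones)
      assert(static_cast<uint8_t>(a ^ a) == 0x00);     // a != a -> false (all-zeros)
      assert(static_cast<uint8_t>(~(a ^ a)) == 0xFF);  // a == a -> true (all-ones)
      return 0;
    }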
76 changes: 3 additions & 73 deletions aten/src/ATen/cuda/tunable/GemmCommon.h
@@ -81,8 +81,7 @@ struct GemmParams : OpParams {
}

std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
-return val;
+return c10::str(transa, transb, "_", m, "_", n, "_", k);
}

size_t GetSize(bool duplicate_inputs) const {
@@ -144,82 +143,14 @@ struct GemmParams : OpParams {
bool duplicate_inputs_;
};

-template <typename T>
-struct GemmAndBiasParams : OpParams {
-std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
-return val;
-}
-
-size_t GetSize(bool duplicate_inputs) const {
-size_t size = sizeof(T) * ldc * n;
-if (duplicate_inputs) {
-size += sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
-size += sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
-}
-return size;
-}
-
-GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const {
-GemmAndBiasParams* copy = new GemmAndBiasParams;
-*copy = *this;
-c10::DeviceIndex device = 0;
-AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
-size_t c_size = ldc * n * sizeof(T);
-copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
-AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
-copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
-if (duplicate_inputs) {
-size_t a_size = sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
-size_t b_size = sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
-copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
-copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
-copy->duplicate_inputs_ = true;
-}
-return copy;
-}
-
-// only call on object returned by DeepCopy
-void Delete() {
-c10::cuda::CUDACachingAllocator::raw_delete(c);
-if (duplicate_inputs_) {
-c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
-c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
-}
-}
-
-TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
-auto c_dtype = c10::CppTypeToScalarType<T>::value;
-return detail::NumericalCheck(c_dtype, c, other->c, ldc*n) ? OK : FAIL;
-}
-
-char transa;
-char transb;
-int64_t m;
-int64_t n;
-int64_t k;
-at::opmath_type<T> alpha;
-const T* a;
-int64_t lda;
-const T* b;
-int64_t ldb;
-T* c;
-int64_t ldc;
-const T* bias;
-at::cuda::blas::GEMMAndBiasActivationEpilogue activation;
-private:
-bool duplicate_inputs_;
-};
-
template <typename T>
struct GemmStridedBatchedParams : OpParams {
GemmStridedBatchedParams() {
duplicate_inputs_ = false;
}

std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
-return val;
+return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
}

size_t GetSize(bool duplicate_inputs) const {
@@ -292,8 +223,7 @@ struct ScaledGemmParams : OpParams {
}

std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
-return val;
+return c10::str(transa, transb, "_", m, "_", n, "_", k);
}

size_t GetSize(bool duplicate_inputs) const {
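The three Signature() changes in GemmCommon.h all fix the same bug: a function-local static std::string is initialized exactly once, on the first call, so every params object created afterwards with a different shape would report the first object's signature, collapsing distinct GEMM shapes into one tuning-cache entry. A reduced sketch of the failure mode, with hypothetical names:

    #include <iostream>
    #include <string>

    struct Params {  // hypothetical stand-in for the GEMM params structs
      int m, n, k;
      std::string BuggySignature() const {
        // Initialized once, on the first call ever -- then stale forever.
        static std::string val =
            std::to_string(m) + "_" + std::to_string(n) + "_" + std::to_string(k);
        return val;
      }
      std::string FixedSignature() const {
        // Recomputed per call, as in the patched methods.
        return std::to_string(m) + "_" + std::to_string(n) + "_" + std::to_string(k);
      }
    };

    int main() {
      Params a{1, 2, 3}, b{7, 8, 9};
      std::cout << a.BuggySignature() << "\n";  // 1_2_3
      std::cout << b.BuggySignature() << "\n";  // 1_2_3 (stale)
      std::cout << b.FixedSignature() << "\n";  // 7_8_9
      return 0;
    }

The commit also deletes the GemmAndBiasParams struct wholesale rather than patching its copy of the same method.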