
Commit

Merge branch 'pytorch:master' into master
ynonaolga committed Nov 14, 2022
2 parents 957c79c + c8f3d1c commit 1ada67e
Showing 154 changed files with 3,321 additions and 3,185 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-d72e90640ec8514e0369b5419d7f3b74a387b1d7
+deba056203d009fec6b58afb9fa211f6ee3328c8
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-08121e41079319cd369f82f523f5a714a0563f9d
+dd9b67ff0d6ba4da6a46ca1b22e35c98dbed0d77
2 changes: 1 addition & 1 deletion aten/src/ATen/InferSize.h
@@ -80,7 +80,7 @@ inline at::SymDimVector infer_size_dv(
c10::SymInt numel) {
auto res = at::SymDimVector(shape);
infer_size_impl<c10::SymIntArrayRef, c10::SymInt, at::SymDimVector>(
-shape, numel, res);
+shape, std::move(numel), res);
return res;
}

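For context, `infer_size_dv` resolves a reshape target in which one dimension may be given as -1, filling it in so the total element count is preserved; the `std::move(numel)` above avoids copying the `c10::SymInt` argument, which may own a reference-counted symbolic node. A minimal standalone sketch of the inference rule, using plain `int64_t` in place of `c10::SymInt` (the function name and error handling here are illustrative, not ATen's):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Sketch: at most one dimension may be -1; it is replaced so that the
// product of all dimensions equals numel.
std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  int64_t known = 1;
  int wildcard = -1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (shape[i] == -1) {
      if (wildcard != -1) throw std::invalid_argument("only one -1 allowed");
      wildcard = i;
    } else {
      known *= shape[i];
    }
  }
  if (wildcard >= 0) {
    if (known == 0 || numel % known != 0)
      throw std::invalid_argument("shape incompatible with numel");
    shape[wildcard] = numel / known;  // e.g. {2, -1} with numel 6 -> {2, 3}
  } else if (known != numel) {
    throw std::invalid_argument("shape incompatible with numel");
  }
  return shape;
}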
4 changes: 2 additions & 2 deletions aten/src/ATen/core/Formatting.cpp
@@ -13,7 +13,7 @@ std::ostream& operator<<(std::ostream & out, Backend b) {
return out << toString(b);
}

-std::ostream& operator<<(std::ostream & out, Scalar s) {
+std::ostream& operator<<(std::ostream & out, const Scalar& s) {
if (s.isFloatingPoint()) {
return out << s.toDouble();
}
@@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream & out, Scalar s) {
throw std::logic_error("Unknown type in Scalar");
}

-std::string toString(Scalar s) {
+std::string toString(const Scalar& s) {
std::stringstream out;
out << s;
return out.str();
4 changes: 2 additions & 2 deletions aten/src/ATen/core/Formatting.h
@@ -8,8 +8,8 @@

namespace c10 {
TORCH_API std::ostream& operator<<(std::ostream& out, Backend b);
-TORCH_API std::ostream& operator<<(std::ostream & out, Scalar s);
-TORCH_API std::string toString(Scalar s);
+TORCH_API std::ostream& operator<<(std::ostream & out, const Scalar& s);
+TORCH_API std::string toString(const Scalar& s);
}
namespace at {

4 changes: 2 additions & 2 deletions aten/src/ATen/core/List_test.cpp
@@ -1118,7 +1118,7 @@ TEST(ListTest, canAccessStringByReference) {
List<std::string> list({"one", "two"});
const auto& listRef = list;
static_assert(std::is_same<decltype(listRef[1]), const std::string&>::value,
"const List<std::string> acccess should be by const reference");
"const List<std::string> access should be by const reference");
std::string str = list[1];
const std::string& strRef = listRef[1];
EXPECT_EQ("two", str);
@@ -1130,7 +1130,7 @@ TEST(ListTest, canAccessOptionalStringByReference) {
const auto& listRef = list;
static_assert(
std::is_same<decltype(listRef[1]), c10::optional<std::reference_wrapper<const std::string>>>::value,
"List<c10::optional<std::string>> acccess should be by const reference");
"List<c10::optional<std::string>> access should be by const reference");
c10::optional<std::string> str1 = list[1];
c10::optional<std::string> str2 = list[2];
decltype(auto) strRef1 = listRef[1];
5 changes: 4 additions & 1 deletion aten/src/ATen/core/PythonFallbackKernel.cpp
@@ -74,10 +74,13 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
(*interpreter)->dispatch(op, stack);
return;
}
-} else if (ivalue.isTensorList() || (ivalue.isOptionalTensorList() && !ivalue.isNone())) {
+} else if (ivalue.isTensorList() || ivalue.isOptionalTensorList()) {
// NB: use toListRef as it doesn't induce refcount bumps (toTensorListRef
// is not a thing)
for (const auto& nv : ivalue.toListRef()) {
+if (nv.isNone()) {
+  continue;
+}
auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter();
if (interpreter) {
(*interpreter)->dispatch(op, stack);
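The change above moves the None check from the whole IValue to each list element, so an optional tensor list that mixes real tensors and None (e.g. [None, t1]) still dispatches through the interpreter of a non-None tensor instead of being skipped outright. A rough standalone sketch of that per-element pattern, with std::optional and a plain struct standing in for IValue and the tensor/interpreter machinery (all names here are illustrative):

#include <cstdio>
#include <optional>
#include <vector>

struct Tensor { const char* name; };

// Dispatch via the first element that actually holds a value, skipping
// None entries instead of rejecting the whole list up front.
void dispatch_first_present(const std::vector<std::optional<Tensor>>& list) {
  for (const auto& entry : list) {
    if (!entry.has_value()) {
      continue;  // mirrors the added `if (nv.isNone()) continue;`
    }
    std::printf("dispatching via %s\n", entry->name);
    return;
  }
}

int main() {
  dispatch_first_present({std::nullopt, Tensor{"t1"}});  // prints: dispatching via t1
}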
4 changes: 2 additions & 2 deletions aten/src/ATen/core/class_type.cpp
@@ -86,7 +86,7 @@ std::string ClassType::getForwardPreHookErrorMessage(int pre_hook_idx) const {
std::string pre_hook_schema =
pre_hook_name + "(self, input: Tuple[" + input_types + "])";
std::string return_string =
"This error occured while scripting the forward pre-hook '" +
"This error occurred while scripting the forward pre-hook '" +
pre_hook_name + "' on module '" + name()->name() +
"'. If you did not want to script this pre-hook remove it from the "
"original NN module before scripting. Pre-hooks for module '" +
@@ -111,7 +111,7 @@ std::string ClassType::getForwardHookErrorMessage(int hook_idx) const {
std::string hook_schema = hook_name + "(self, input: Tuple[" +
input_types + "], output: " + output_types + ")";
std::string return_string =
"This error occured while scripting the forward hook '"
"This error occurred while scripting the forward hook '"
+ hook_name + "' on module " + name()->name() +
". If you did not want to script this hook remove it from" +
" the original NN module before scripting. This hook was" +
195 changes: 195 additions & 0 deletions aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -1133,6 +1133,201 @@ inline Vectorized<int8_t> Vectorized<int8_t>::le(const Vectorized<int8_t>& other
return (*this <= other) & Vectorized<int8_t>(1);
}

template <bool left_shift>
Vectorized<int16_t> inline shift_256_16(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
// No vector instruction for shifting int16_t, so emulating it instead.

// Control masks for shuffle operation, treating 256 bits as an
// array of 16-bit elements, and considering pairs of neighboring
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.
__m256i ctl_0_1 = _mm256_set_epi8(29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80,
21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80,
13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80,
5, 4, 0x80, 0x80, 1, 0, 0x80, 0x80);
__m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 31, 30, 0x80, 0x80, 27, 26,
0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18,
0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10,
0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2);

// Masks for bitwise and operation, treating 256 bits as an array of
// 16-bit elements, and considering them in pairs of neighboring
// elements. A mask named "keep_M" (M in [0,1]) is set so that
// bitwise and will copy element with index M from input pair into
// element with the same index in output pair, while the other
// element in output pair will be set to all 0s.
__m256i keep_0 = _mm256_set1_epi32(0xFFFF);
__m256i keep_1 = _mm256_set1_epi32(0xFFFF0000);

// Take each 16-bit element with idx%2==0 from input array to be
// shifted and extend it to 32 bits so that 0s are added to the
// right. Then, perform shifting on this 32-bit number. Upper 16
// bits will be proper result of shifting original 16-bit number, so
// write them to result array, into the same position from which
// corresponding input element is taken. Also, make sure that
// result array elements with idx%2!=0 are set to all 0s.
//
// Note that the number of bits to shift by is extended to 32 bits by
// adding 0s to the left. That means this number is not properly
// sign-extended for negative values. However, number of bits to
// shift is treated as an unsigned integer by respective shift
// intrinsics anyway so if negative then either with or without
// proper sign extension, it will be interpreted as a number greater
// than 32, and the shifting result will be the same.
__m256i a0 = _mm256_shuffle_epi8(a, ctl_0_1);
__m256i b0 = _mm256_and_si256(b, keep_0);
__m256i c0;
if (left_shift)
c0 = _mm256_sllv_epi32(a0, b0);
c0 = _mm256_shuffle_epi8(c0, ctl_1_0);

// Perform shifting the same way for input array elements with
// idx%2==1.
__m256i a1 = _mm256_and_si256(a, keep_1);
__m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
__m256i c1;
if (left_shift)
c1 = _mm256_sllv_epi32(a1, b1);
c1 = _mm256_and_si256(c1, keep_1);

// Merge partial results into the final result.
__m256i c = _mm256_or_si256(c0, c1);

return c;
}

template <bool left_shift>
Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
// No vector instruction for shifting int8_t, so emulating it instead.

// Control masks for shuffle operation, treating 256 bits as an
// array of 8-bit elements, and considering quadruples of
// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
// in [0,1,2,3], and M!=N) is set so that shuffle will move element
// with index M from input quadruple into element with index N in
// output quadruple, and other elements in output quadruple will be
// set to all 0s.
__m256i ctl_0_3 = _mm256_set_epi8(28, 0x80, 0x80, 0x80, 24, 0x80, 0x80, 0x80,
20, 0x80, 0x80, 0x80, 16, 0x80, 0x80, 0x80,
12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80,
4, 0x80, 0x80, 0x80, 0, 0x80, 0x80, 0x80);
__m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 29, 0x80, 0x80, 0x80, 25,
0x80, 0x80, 0x80, 21, 0x80, 0x80, 0x80, 17,
0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 9,
0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 1);
__m256i ctl_1_3 = _mm256_set_epi8(29, 0x80, 0x80, 0x80, 25, 0x80, 0x80, 0x80,
21, 0x80, 0x80, 0x80, 17, 0x80, 0x80, 0x80,
13, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80,
5, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80);
__m256i ctl_2_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 30, 0x80, 0x80, 0x80, 26,
0x80, 0x80, 0x80, 22, 0x80, 0x80, 0x80, 18,
0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 10,
0x80, 0x80, 0x80, 6, 0x80, 0x80, 0x80, 2);
__m256i ctl_2_3 = _mm256_set_epi8(30, 0x80, 0x80, 0x80, 26, 0x80, 0x80, 0x80,
22, 0x80, 0x80, 0x80, 18, 0x80, 0x80, 0x80,
14, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80,
6, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80);
__m256i ctl_3_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 31, 0x80, 0x80, 0x80, 27,
0x80, 0x80, 0x80, 23, 0x80, 0x80, 0x80, 19,
0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11,
0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3);
__m256i ctl_3_1 = _mm256_set_epi8(0x80, 0x80, 31, 0x80, 0x80, 0x80, 27, 0x80,
0x80, 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80,
0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80,
0x80, 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80);
__m256i ctl_3_2 = _mm256_set_epi8(0x80, 31, 0x80, 0x80, 0x80, 27, 0x80, 0x80,
0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, 0x80,
0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80,
0x80, 7, 0x80, 0x80, 0x80, 3, 0x80, 0x80);

// Masks for bitwise and operation, treating 256 bits as an array of
// 8-bit elements, and considering them in quadruples of neighboring
// elements. A mask named "keep_M" (M in [0,1,2,3]) is set so that
// bitwise and will copy element with index M from input quadruple
// into element with the same index in output quadruple, while the
// other elements in output quadruple will be set to all 0s.
__m256i keep_0 = _mm256_set1_epi32(0xFF);
__m256i keep_3 = _mm256_set1_epi32(0xFF000000);

// Take each 8-bit element with idx%4==0 from input array to be
// shifted and extend it to 32 bits so that 0s are added to the
// right. Then, perform shifting on this 32-bit number. Upper 8
// bits will be proper result of shifting original 8-bit number, so
// write them to result array, into the same position from which
// corresponding input element is taken. Also, make sure that
// result array elements with idx%4!=0 are set to all 0s.
//
// Note that the number of bits to shift by is extended to 32 bits by
// adding 0s to the left. That means this number is not properly
// sign-extended for negative values. However, number of bits to
// shift is treated as an unsigned integer by respective shift
// intrinsics anyway so if negative then either with or without
// proper sign extension, it will be interpreted as a number greater
// than 32, and the shifting result will be the same.
__m256i a0 = _mm256_shuffle_epi8(a, ctl_0_3);
__m256i b0 = _mm256_and_si256(b, keep_0);
__m256i c0;
if (left_shift)
c0 = _mm256_sllv_epi32(a0, b0);
c0 = _mm256_shuffle_epi8(c0, ctl_3_0);

// Perform shifting the same way for input array elements with
// idx%4==1.
__m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
__m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
__m256i c1;
if (left_shift)
c1 = _mm256_sllv_epi32(a1, b1);
c1 = _mm256_shuffle_epi8(c1, ctl_3_1);

// Perform shifting the same way for input array elements with
// idx%4==2.
__m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
__m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
__m256i c2;
if (left_shift)
c2 = _mm256_sllv_epi32(a2, b2);
c2 = _mm256_shuffle_epi8(c2, ctl_3_2);

// Perform shifting the same way for input array elements with
// idx%4==3.
__m256i a3 = _mm256_and_si256(a, keep_3);
__m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);
__m256i c3;
if (left_shift)
c3 = _mm256_sllv_epi32(a3, b3);
c3 = _mm256_and_si256(c3, keep_3);

// Merge partial results into the final result.
__m256i c01 = _mm256_or_si256(c0, c1);
__m256i c23 = _mm256_or_si256(c2, c3);
__m256i c = _mm256_or_si256(c01, c23);

return c;
}

template <>
Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
return _mm256_sllv_epi64(a, b);
}

template <>
Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
return _mm256_sllv_epi32(a, b);
}

template <>
Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
return shift_256_16<true>(a, b);
}

template <>
Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
return shift_256_8<true>(a, b);
}

#endif

}}}
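As a sanity check on the emulation above: element-wise, the widened-lane trick should agree with a plain scalar model in which each lane is shifted independently and any count at or above the lane width (including negative counts, reinterpreted as large unsigned values) yields 0. A hedged scalar reference for the 16-bit case; this is not ATen code, just a model of the intended semantics:

#include <cstdint>
#include <cstdio>

// Scalar model of the emulated vector left shift: counts >= 16, or
// negative counts reinterpreted as large unsigned values, produce 0,
// matching what the 32-bit sllv path above computes per 16-bit lane.
int16_t shl16_ref(int16_t a, int16_t b) {
  uint16_t count = static_cast<uint16_t>(b);
  if (count >= 16) return 0;
  return static_cast<int16_t>(static_cast<uint16_t>(a) << count);
}

int main() {
  std::printf("%d\n", shl16_ref(3, 2));   // 12
  std::printf("%d\n", shl16_ref(3, 17));  // 0: count >= lane width
  std::printf("%d\n", shl16_ref(3, -1));  // 0: negative count
}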
93 changes: 93 additions & 0 deletions aten/src/ATen/cpu/vec/vec512/vec512_int.h
@@ -1163,6 +1163,99 @@ inline Vectorized<int8_t> Vectorized<int8_t>::le(const Vectorized<int8_t>& other
return (*this <= other) & Vectorized<int8_t>(1);
}

template <bool left_shift>
Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
// No vector instruction for shifting int8_t, so emulating it instead.

// Control masks for shuffle operation, treating 512 bits as an
// array of 8-bit elements, and considering pairs of neighboring
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.
__m512i ctl_0_1 = _mm512_set_epi8(62, 0x80, 60, 0x80, 58, 0x80, 56, 0x80,
54, 0x80, 52, 0x80, 50, 0x80, 48, 0x80,
46, 0x80, 44, 0x80, 42, 0x80, 40, 0x80,
38, 0x80, 36, 0x80, 34, 0x80, 32, 0x80,
30, 0x80, 28, 0x80, 26, 0x80, 24, 0x80,
22, 0x80, 20, 0x80, 18, 0x80, 16, 0x80,
14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80,
6, 0x80, 4, 0x80, 2, 0x80, 0, 0x80);
__m512i ctl_1_0 = _mm512_set_epi8(0x80, 63, 0x80, 61, 0x80, 59, 0x80, 57,
0x80, 55, 0x80, 53, 0x80, 51, 0x80, 49,
0x80, 47, 0x80, 45, 0x80, 43, 0x80, 41,
0x80, 39, 0x80, 37, 0x80, 35, 0x80, 33,
0x80, 31, 0x80, 29, 0x80, 27, 0x80, 25,
0x80, 23, 0x80, 21, 0x80, 19, 0x80, 17,
0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9,
0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1);

// Masks for bitwise and operation, treating 512 bits as an array of
// 8-bit elements, and considering them in pairs of neighboring
// elements. A mask named "keep_M" (M in [0,1]) is set so that
// bitwise and will copy element with index M from input pair into
// element with the same index in output pair, while the other
// element in output pair will be set to all 0s.
__m512i keep_0 = _mm512_set1_epi16(0xFF);
__m512i keep_1 = _mm512_set1_epi16(0xFF00);

// Take each 8-bit element with idx%2==0 from input array to be
// shifted and extend it to 16 bits so that 0s are added to the
// right. Then, perform shifting on this 16-bit number. Upper 8
// bits will be proper result of shifting original 8-bit number, so
// write them to result array, into the same position from which
// corresponding input element is taken. Also, make sure that
// result array elements with idx%2!=0 are set to all 0s.
//
// Note that the number of bits to shift by is extended to 16 bits by
// adding 0s to the left. That means this number is not properly
// sign-extended for negative values. However, number of bits to
// shift is treated as an unsigned integer by respective shift
// intrinsics anyway so if negative then either with or without
// proper sign extension, it will be interpreted as a number greater
// than 16, and the shifting result will be the same.
__m512i a0 = _mm512_shuffle_epi8(a, ctl_0_1);
__m512i b0 = _mm512_and_si512(b, keep_0);
__m512i c0;
if (left_shift)
c0 = _mm512_sllv_epi16(a0, b0);
c0 = _mm512_shuffle_epi8(c0, ctl_1_0);

// Perform shifting the same way for input array elements with
// idx%2==1.
__m512i a1 = _mm512_and_si512(a, keep_1);
__m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);
__m512i c1;
if (left_shift)
c1 = _mm512_sllv_epi16(a1, b1);
c1 = _mm512_and_si512(c1, keep_1);

// Merge partial results into the final result.
__m512i c = _mm512_or_si512(c0, c1);

return c;
}

template <>
Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
return _mm512_sllv_epi64(a, b);
}

template <>
Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
return _mm512_sllv_epi32(a, b);
}

template <>
Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
return _mm512_sllv_epi16(a, b);
}

template <>
Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
return shift_512_8<true>(a, b);
}

#endif

}}}
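Hedged usage sketch for the new operators: with these specializations in place, << on Vectorized integer types should behave as an element-wise left shift. The loadu/store/size() calls below follow the usual Vectorized<T> API; treat this as illustrative rather than a verbatim test:

#include <ATen/cpu/vec/vec.h>
#include <cstdio>

int main() {
  using Vec = at::vec::Vectorized<int16_t>;
  int16_t a[Vec::size()];
  int16_t b[Vec::size()];
  int16_t out[Vec::size()];
  for (int i = 0; i < Vec::size(); ++i) {
    a[i] = 3;
    b[i] = static_cast<int16_t>(i % 4);  // shift counts 0..3
  }
  Vec c = Vec::loadu(a) << Vec::loadu(b);  // element-wise left shift
  c.store(out);
  for (int i = 0; i < Vec::size(); ++i) {
    std::printf("%d ", out[i]);  // expected: 3 6 12 24 repeating
  }
  std::printf("\n");
}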
