Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vulkan tweaks #47261

Closed
14 changes: 0 additions & 14 deletions aten/src/ATen/native/vulkan/Vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1038,12 +1038,6 @@ ComputeUnit& ComputeUnitFactory::get(
// VBuffer <-> VImage
void copy_buffer_to_image(const VBuffer& buffer, VImage& image) {
const auto device = context().device();
struct ConstBlock {
int32_t w;
int32_t h;
};
const ConstBlock constBlock{image.w(), image.h()};
VBuffer constBuffer = makeUniformConstBuffer(&constBlock, sizeof(constBlock));

VkDescriptorSetLayout descrSetLayout{};
VkDescriptorSetLayoutBinding bindings[] = {
Expand All @@ -1065,7 +1059,6 @@ void copy_buffer_to_image(const VBuffer& buffer, VImage& image) {

image.bindStorageImage(descrSet, 0);
buffer.bind(descrSet, 1);
constBuffer.bind(descrSet, 2);
WorkGroupSize workGroupSize{8, 8, 1};

auto& computeUnit = context().computeUnitFactory().get(
Expand Down Expand Up @@ -1097,12 +1090,6 @@ void copy_image_to_buffer(
TORCH_INTERNAL_ASSERT(
buffer.sizeBytes() >= image.capacityBytes(),
"VulkanBuffer's capacity is less than VulkanImage capacity to copy from");
struct ConstBlock {
int32_t w;
int32_t h;
};
const ConstBlock constBlock{image.w(), image.h()};
VBuffer constBuffer = makeUniformConstBuffer(&constBlock, sizeof(constBlock));

VkDescriptorSetLayout descrSetLayout{};
const VkDescriptorSetLayoutBinding bindings[] = {
Expand All @@ -1125,7 +1112,6 @@ void copy_image_to_buffer(

image.bindShaderRead(descrSet, 0);
buffer.bind(descrSet, 1);
constBuffer.bind(descrSet, 2);

const WorkGroupSize workGroupSize{8, 8, 1};
auto& computeUnit = context().computeUnitFactory().get(
Expand Down
72 changes: 7 additions & 65 deletions aten/src/ATen/native/vulkan/VulkanOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,10 @@ void upsample_nearest2d(
auto physicalDevice = context().physicalDevice();
int64_t C = IN * IC;
struct ConstBlock {
int32_t IW;
int32_t IH;
int32_t OW;
int32_t OH;
float scaleX;
float scaleY;
};
ConstBlock cb{safe_downcast<int32_t>(IW),
safe_downcast<int32_t>(IH),
safe_downcast<int32_t>(OW),
safe_downcast<int32_t>(OH),
scaleW,
ConstBlock cb{scaleW,
scaleH};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

Expand Down Expand Up @@ -113,17 +105,6 @@ void adaptive_avg_pool2d(
const int64_t IC) {
auto device = context().device();
int64_t C = IN * IC;
struct ConstBlock {
int32_t IW;
int32_t IH;
int32_t OW;
int32_t OH;
};
ConstBlock cb{safe_downcast<int32_t>(IW),
safe_downcast<int32_t>(IH),
safe_downcast<int32_t>(OW),
safe_downcast<int32_t>(OH)};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

VkDescriptorSetLayout descriptorSetLayout{};
VkDescriptorPool descriptorPool{};
Expand All @@ -141,7 +122,6 @@ void adaptive_avg_pool2d(

output.image()->bindStorageImage(descriptorSet, 0);
input.image()->bindShaderRead(descriptorSet, 1);
constBuffer.bind(descriptorSet, 2);

WorkGroupSize workGroupSize{8, 8, 1};
auto& computeUnit = context().computeUnitFactory().get(
Expand Down Expand Up @@ -240,20 +220,14 @@ void avg_pool2d(
auto device = context().device();
const auto c = _n * _c;
struct ConstBlock {
int32_t inputSize[4];
int32_t outputSize[4];
int32_t kernelSize[2];
int32_t stride[2];
int32_t padding[2];
int32_t dilate[2];
};
ConstBlock cb{
{iW, iH, c, 0},
{oW, oH, c, 0},
{kW, kH},
{dW, dH},
{padW, padH},
{1, 1},
};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

Expand Down Expand Up @@ -505,15 +479,9 @@ void add(
auto device = context().device();
auto physicalDevice = context().physicalDevice();
struct ConstBlock {
int32_t W;
int32_t H;
int32_t C;
float alpha;
};
ConstBlock cb{safe_downcast<int32_t>(W),
safe_downcast<int32_t>(H),
safe_downcast<int32_t>(C),
alpha};
ConstBlock cb{alpha};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

VkDescriptorSetLayout descriptorSetLayout{};
Expand Down Expand Up @@ -561,13 +529,9 @@ void add(VulkanTensor& output, const VulkanTensor& input, const float s) {

auto device = context().device();
struct ConstBlock {
int32_t inputSize[3];
float s;
};
ConstBlock cb{{safe_downcast<int32_t>(W),
safe_downcast<int32_t>(H),
safe_downcast<int32_t>(C_4)},
s};
ConstBlock cb{s};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

VkDescriptorSetLayout descriptorSetLayout{};
Expand Down Expand Up @@ -612,13 +576,9 @@ void mul(VulkanTensor& output, const VulkanTensor& input, const float s) {

auto device = context().device();
struct ConstBlock {
int32_t inputSize[3];
float s;
};
ConstBlock cb{{safe_downcast<int32_t>(W),
safe_downcast<int32_t>(H),
safe_downcast<int32_t>(C_4)},
s};
ConstBlock cb{s};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

VkDescriptorSetLayout descriptorSetLayout{};
Expand Down Expand Up @@ -1160,19 +1120,10 @@ void clamp(
auto device = context().device();
auto physicalDevice = context().physicalDevice();
struct ConstBlock {
int32_t W;
int32_t H;
int32_t C_4;
//int32_t C;
float min;
float max;
};
ConstBlock cb{safe_downcast<int32_t>(W),
safe_downcast<int32_t>(H),
safe_downcast<int32_t>(C_4),
//safe_downcast<int32_t>(C),
min,
max};
ConstBlock cb{min, max};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

VkDescriptorSetLayout descriptorSetLayout{};
Expand Down Expand Up @@ -1240,19 +1191,10 @@ void addmm(
auto device = context().device();

struct ConstBlock {
int32_t OW;
int32_t OH;
int32_t C_4;
float beta;
float alpha;
int32_t K;
float beta;
};
ConstBlock cb{safe_downcast<int32_t>(OW),
safe_downcast<int32_t>(OH),
safe_downcast<int32_t>(C_4),
beta,
alpha,
safe_downcast<int32_t>(K)};
ConstBlock cb{alpha, beta};
VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb));

VkDescriptorSetLayout descriptorSetLayout{};
Expand Down
5 changes: 5 additions & 0 deletions aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ namespace at {
namespace native {
namespace vulkan {

#ifndef USE_VULKAN_API

using detail::convolution2d::createConv2dClampPrePackOpContext;

TORCH_LIBRARY(vulkan, m) {
Expand Down Expand Up @@ -49,6 +51,9 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) {
// Registers detail::convolution2d::conv2d_clamp_run as the implementation of
// the "conv2d_clamp_run" operator of the vulkan_prepack library under the
// Vulkan key passed to TORCH_LIBRARY_IMPL.
TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) {
m.impl("conv2d_clamp_run", detail::convolution2d::conv2d_clamp_run);
}

#endif /* USE_VULKAN_API */

} // namespace vulkan
} // namespace native
} // namespace at
6 changes: 3 additions & 3 deletions aten/src/ATen/native/vulkan/api/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,13 +264,13 @@ void Command::Buffer::dispatch(

vkCmdDispatch(
command_buffer_,
div_up(
utils::div_up(
global_work_group.width,
bound_.pipeline.local_work_group.width),
div_up(
utils::div_up(
global_work_group.height,
bound_.pipeline.local_work_group.height),
div_up(
utils::div_up(
global_work_group.depth,
bound_.pipeline.local_work_group.depth));
}
Expand Down
7 changes: 7 additions & 0 deletions aten/src/ATen/native/vulkan/api/Resource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,14 @@ Resource::Pool::~Pool() {
try {
purge();
}
catch (const std::exception& e) {
LOG(WARNING)
<< "Vulkan: Resource pool destructor raised an exception! Error: "
<< e.what();
}
catch (...) {
LOG(WARNING)
<< "Vulkan: Resource pool destructor raised an unknown exception!";
}
}

Expand Down
55 changes: 55 additions & 0 deletions aten/src/ATen/native/vulkan/api/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,68 @@ namespace at {
namespace native {
namespace vulkan {
namespace api {
namespace utils {

// Integer division rounded towards positive infinity: the smallest q such
// that q * denominator >= numerator.
//
// numerator   - value to be divided.
// denominator - divisor; must be non-zero (division by zero is undefined).
//
// Computed as quotient plus a remainder test rather than
// (numerator + denominator - 1) / denominator, because that sum wraps around
// in uint32_t for large inputs and silently yields a too-small result.
inline uint32_t div_up(
    const uint32_t numerator,
    const uint32_t denominator) {
  const uint32_t quotient = numerator / denominator;
  const uint32_t remainder = numerator % denominator;
  return quotient + ((remainder != 0u) ? 1u : 0u);
}

// Maps a tensor dtype to the VkFormat used for it.
//
// kFloat maps to a four-channel float format: 16-bit per channel when
// VULKAN_FP16_INFERENCE is defined, 32-bit per channel otherwise.  Any other
// scalar type fails the TORCH_CHECK below.
inline VkFormat convert(const caffe2::TypeMeta dtype) {
  const auto scalar_type = c10::typeMetaToScalarType(dtype);

  if (kFloat == scalar_type) {
#ifdef VULKAN_FP16_INFERENCE
    return VK_FORMAT_R16G16B16A16_SFLOAT;
#else
    return VK_FORMAT_R32G32B32A32_SFLOAT;
#endif /* VULKAN_FP16_INFERENCE */
  }

  TORCH_CHECK(
      false,
      "Vulkan tensor format not supported!");

  // Fallback return value placed after the failed check, as in the switch
  // form of this function.
  return VK_FORMAT_UNDEFINED;
}

namespace detail {

// Range-checked numeric conversion of v from From to To.
//
// The bounds of To are converted to the common type of From and To and v is
// compared against them; a value outside [lowest(To), max(To)] fails the
// TORCH_CHECK.  Negative signed values headed for an unsigned To are NOT
// detected here when the common type is unsigned; the public safe_downcast
// overloads perform that check before delegating to this function.
template <typename To, typename From>
inline constexpr To safe_downcast(const From v) {
  typedef std::common_type_t<From, To> Type;
  constexpr Type min{static_cast<Type>(std::numeric_limits<To>::lowest())};
  constexpr Type max{static_cast<Type>(std::numeric_limits<To>::max())};

  // An unsigned source can never lie below To's lowest value, so the lower
  // bound needs no check.  Skipping it is also required for correctness: for
  // unsigned -> signed conversions whose common type is unsigned (e.g.
  // uint32_t -> int32_t), the negative lowest() wraps to a huge unsigned
  // 'min' that exceeds 'max', which would make every value fail the check.
  constexpr bool from_is_unsigned = std::is_unsigned<From>::value;

  TORCH_CHECK(
      (from_is_unsigned || (min <= v)) && (v <= max),
      "Cast failed: out of range!");
  return static_cast<To>(v);
}

// Compile-time predicate: true exactly when From is a signed type and To is
// an unsigned type, as reported by std::is_signed / std::is_unsigned.
template <typename To, typename From>
inline constexpr bool is_signed_to_unsigned() {
  constexpr bool source_is_signed = std::is_signed<From>::value;
  constexpr bool target_is_unsigned = std::is_unsigned<To>::value;
  return source_is_signed && target_is_unsigned;
}

} // namespace detail

// safe_downcast overload chosen when From is signed and To is unsigned.
//
// Rejects negative inputs up front, then defers to detail::safe_downcast for
// the range check against To's limits and the actual conversion.
template <
    typename To,
    typename From,
    std::enable_if_t<detail::is_signed_to_unsigned<To, From>(), bool> = true>
inline constexpr To safe_downcast(const From v) {
  constexpr From zero{};
  TORCH_CHECK(zero <= v, "Cast failed: negative signed to unsigned!");
  return detail::safe_downcast<To, From>(v);
}

// safe_downcast overload chosen for every conversion that is not
// signed -> unsigned.  No sign pre-check is needed, so it forwards straight to
// detail::safe_downcast, which range-checks v and performs the conversion.
template <
    typename To,
    typename From,
    std::enable_if_t<!detail::is_signed_to_unsigned<To, From>(), bool> = true>
inline constexpr To safe_downcast(const From v) {
  const To converted = detail::safe_downcast<To, From>(v);
  return converted;
}

} // namespace utils
} // namespace api
} // namespace vulkan
} // namespace native
Expand Down
54 changes: 26 additions & 28 deletions aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,38 @@ layout(std430) uniform;

/* Qualifiers: layout - storage - precision - memory */

layout(set = 0, binding = 0, rgba16f) uniform PRECISION writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform restrict Block {
int IW;
int IH;
int OW;
int OH;
} uBlock;
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;

layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);
int ow = uBlock.OW;
int oh = uBlock.OH;
if (pos.x < ow && pos.y < oh) {
int iw = uBlock.IW;
int ih = uBlock.IH;

int sx = int(floor(float(pos.x * iw) / ow));
int sy = int(floor(float(pos.y * ih) / oh));
int ex = int(ceil(float((pos.x + 1) * iw) / ow));
int ey = int(ceil(float((pos.y + 1) * ih) / oh));

vec4 r = vec4(1.0) / float(ex - sx) / float(ey - sy);
vec4 acc = vec4(0);

int xi, yi;
for (xi = sx; xi < ex; ++xi) {
for (yi = sy; yi < ey; ++yi) {
acc += texelFetch(uInput, ivec3(xi, yi, pos.z), 0);
const ivec3 pos = ivec3(gl_GlobalInvocationID);

/* Dynamically Uniform */
const ivec3 size = imageSize(uOutput);
const vec3 isize = textureSize(uInput, 0);
const vec2 stride = isize.xy / size.xy;
const vec2 kernel = isize.xy - (size.xy - 1) * stride;

if (all(lessThan(pos, size))) {
const vec2 ipos = pos.xy * stride;

const ivec2 start = ivec2(ipos);
const ivec2 end = ivec2(ceil(ipos + kernel));
const ivec2 range = end - start;

vec4 sum = vec4(0);

for (int y = start.y; y < end.y; ++y) {
for (int x = start.x; x < end.x; ++x) {
sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
}
}

imageStore(uOutput, pos, r * acc);
imageStore(
uOutput,
pos,
sum / (range.x * range.y));
}
}