Add Layout field to Conv and Pool nodes and remove OCL specific versions #3367

Closed
41 changes: 0 additions & 41 deletions docs/NewBackendSpecificNode.md
@@ -48,47 +48,6 @@ ReLU is max between zero and the input value. Glow lowers `ReLUNode` to two basi

Please refer to the [Backend](https://github.com/pytorch/glow/blob/master/docs/Backends.md#backend-specific-nodes-and-instructions) document for source-code details on adding a new backend-specific CPUMaxSplatNode on the CPU backend.

#### Data Layout Transformation for Conv Operator in OpenCL

The OpenCL convolution is faster in the `NCHW` layout, but the default layout of the convolution operator in Glow is `NHWC`. We therefore transpose the inputs/output and replace the `ConvolutionNode` with a backend-specific `OCLConvolutionNode` that uses `NCHW`. The transposes can mostly be optimized away thanks to the high-level graph optimizations.

The OpenCL backend defines `OCLConvolution` in `tools/ClassGen/OpenCL/OpenCLSpecificNodes.h` to support input in the `NCHW` layout.

```cpp
BB.newNode("OCLConvolution")
    .addInput("Input")
    .addInput("Filter")
    .addInput("Bias")
    .addMember(MemberType::VectorUnsigned, "Kernels")
    .addMember(MemberType::VectorUnsigned, "Strides")
    .addMember(MemberType::VectorUnsigned, "Pads")
    .addMember(MemberType::Unsigned, "Group")
    .addResultFromCtorArg()
    .setDocstring(
        "This is an OpenCL-specific convolution implementation where the "
        "filter, the bias and the input are in the NCHW format");
```

During `transformPostLowering()`, `convertConvToNCHWConv` replaces the aforementioned pattern with an `NCHWConvNode` and multiple `Transpose` nodes for the `Input`, `Filter`, and `Result`.
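
The replacement itself happens in the backend's graph-transformation hook. The following is a minimal sketch of that hook; the exact `transformPostLowering()` signature and the `OCLBackend` plumbing are assumptions and vary across Glow versions:

```cpp
bool OCLBackend::transformPostLowering(Function *F,
                                       CompilationContext &cctx) const {
  bool changed = false;
  for (auto &node : F->getNodes()) {
    // Rewrite the default NHWC convolutions into the OpenCL-specific form.
    if (auto *CN = llvm::dyn_cast<ConvolutionNode>(&node)) {
      // convertConvToNCHWConv inserts the input/filter transposes, the NCHW
      // convolution, and the transpose back to NHWC on the result.
      Node *NR = convertConvToNCHWConv<OCLConvolutionNode>(CN, F);
      CN->getResult().replaceAllUsesOfWith(NR);
      changed = true;
    }
  }
  return changed;
}
```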

A corresponding backend-specific `OCLConvolution` instruction is also needed, defined in
`tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h`:

```cpp
BB.newBackendSpecificInstr("OCLConvolution")
    .addOperand("Dest", OperandKind::Out)
    .addOperand("Src", OperandKind::In)
    .addOperand("Filter", OperandKind::In)
    .addOperand("Bias", OperandKind::In)
    .addMember(MemberType::VectorUnsigned, "Kernels")
    .addMember(MemberType::VectorUnsigned, "Strides")
    .addMember(MemberType::VectorUnsigned, "Pads")
    .addMember(MemberType::Unsigned, "Group")
    .autoIRGen()
    .autoVerify(VerifyKind::SameElementType, {"Dest", "Src", "Filter", "Bias"});
```


### References

38 changes: 28 additions & 10 deletions include/glow/Backends/LayoutConverter.h
@@ -23,7 +23,6 @@ namespace glow {

/// Convert regular convolution nodes (that use NHWC) into backend-specific
/// convolution nodes using NCHW.
template <class NCHWConvNode>
Node *convertConvToNCHWConv(ConvolutionNode *CN, Function *F) {
// Convert filter and input from NHWC (Glow's default) into NCHW.
auto *NI = F->createTranspose("conv.input", CN->getInput(), NHWC2NCHW);
@@ -34,30 +33,49 @@ Node *convertConvToNCHWConv(ConvolutionNode *CN, Function *F) {
auto outTy = F->getParent()->uniqueTypeWithNewShape(CN->getResult().getType(),
dimsNCHW);

auto *NC = F->addNode(new NCHWConvNode(
CN->getName(), outTy, NI, NF, CN->getBias(), CN->getKernels(),
CN->getStrides(), CN->getPads(), CN->getGroup(), CN->getDilation()));
auto *NC = F->addNode(
new ConvolutionNode(CN->getName(), outTy, NI, NF, CN->getBias(),
CN->getKernels(), CN->getStrides(), CN->getPads(),
CN->getGroup(), CN->getDilation(), NCHW));
auto *NR = F->createTranspose("conv.result", NC, NCHW2NHWC);

return NR;
}

/// Convert regular pool nodes (that use NHWC) into backend-specific nodes using
/// NCHW.
template <class PoolNode, class NCHWPoolNode>
Node *convertPoolToNCHWPool(PoolNode *PN, Function *F) {
Node *convertMaxPoolToNCHWPool(MaxPoolNode *PN, Function *F) {
// Convert input from NHWC (Glow's default) into NCHW.
auto *NI = F->createTranspose("conv.input", PN->getInput(), NHWC2NCHW);
auto *NI = F->createTranspose("maxpool.input", PN->getInput(), NHWC2NCHW);

auto dimsNHWC = ShapeNHWC(PN->getResult().getType()->dims());
auto dimsNCHW = {dimsNHWC.n, dimsNHWC.c, dimsNHWC.h, dimsNHWC.w};
auto outTy = F->getParent()->uniqueTypeWithNewShape(PN->getResult().getType(),
dimsNCHW);
auto AMT = F->getParent()->uniqueTypeWithNewShape(PN->getArgmax().getType(),
dimsNCHW);

auto *NPN = F->addNode(new MaxPoolNode(PN->getName(), outTy, AMT, NI,
PN->getKernels(), PN->getStrides(),
PN->getPads(), NCHW));
auto *NR = F->createTranspose("maxpool.result", NPN->getResult(), NCHW2NHWC);

return NR;
}

Node *convertAvgPoolToNCHWPool(AvgPoolNode *PN, Function *F) {
// Convert input from NHWC (Glow's default) into NCHW.
auto *NI = F->createTranspose("maxpool.input", PN->getInput(), NHWC2NCHW);

auto dimsNHWC = ShapeNHWC(PN->getResult().getType()->dims());
auto dimsNCHW = {dimsNHWC.n, dimsNHWC.c, dimsNHWC.h, dimsNHWC.w};
auto outTy = F->getParent()->uniqueTypeWithNewShape(PN->getResult().getType(),
dimsNCHW);

auto *NPN =
F->addNode(new NCHWPoolNode(PN->getName(), outTy, NI, PN->getKernels()[0],
PN->getStrides()[0], PN->getPads()));
auto *NR = F->createTranspose("maxpool.result", NPN, NCHW2NHWC);
F->addNode(new AvgPoolNode(PN->getName(), outTy, NI, PN->getKernels(),
PN->getStrides(), PN->getPads(), NCHW));
auto *NR = F->createTranspose("avgpool.result", NPN->getResult(), NCHW2NHWC);

return NR;
}
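
As a usage sketch, a backend that prefers NCHW pooling can call these helpers from its graph-transformation pass and splice the returned subgraph in place of the original node. The loop below is illustrative, and the `getLayout()` accessor is assumed to be the one generated for the new layout member:

```cpp
for (auto &node : F->getNodes()) {
  if (auto *MP = llvm::dyn_cast<MaxPoolNode>(&node)) {
    if (MP->getLayout() == NHWC) {
      // Becomes Transpose(NHWC2NCHW) -> MaxPool(NCHW) -> Transpose(NCHW2NHWC).
      // The Argmax output would need the same treatment if it is used.
      MP->getResult().replaceAllUsesOfWith(convertMaxPoolToNCHWPool(MP, F));
    }
  } else if (auto *AP = llvm::dyn_cast<AvgPoolNode>(&node)) {
    if (AP->getLayout() == NHWC) {
      AP->getResult().replaceAllUsesOfWith(convertAvgPoolToNCHWPool(AP, F));
    }
  }
}
```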
56 changes: 32 additions & 24 deletions include/glow/Graph/Graph.h
@@ -341,14 +341,15 @@ class Function final : public Named {
/// \p group defines the number of groups the input and output channels should
/// be divided into and convolved separately. \p dilation defines factor by
/// which gap between 2 neighboring kernel elements is expanded along each
/// axis.
/// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.

ConvolutionNode *createConv(llvm::StringRef name, NodeValue input,
NodeValue filter, NodeValue bias, TypeRef outTy,
llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads, unsigned_t group,
unsigned_t dilation = 1);
ConvolutionNode *
Contributor: update doxygen with the new param everywhere.

createConv(llvm::StringRef name, NodeValue input, NodeValue filter,
NodeValue bias, TypeRef outTy, llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads, unsigned_t group,
unsigned_t dilation = 1,
ConvolutionLayout layout = ConvolutionLayout::NHWC);

/// Creates a ConvolutionNode with the given \p name which convolves the 4D
/// \p input with \p filter and \p bias. \p kernel defines the size of the
Expand All @@ -358,13 +359,14 @@ class Function final : public Named {
/// \p group defines the number of groups the input and output channels should
/// be divided into and convolved separately. \p dilation defines factor by
/// which gap between 2 neighboring kernel elements is expanded along each
/// axis.
/// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.

ConvolutionNode *createConv(llvm::StringRef name, NodeValue input,
NodeValue filter, NodeValue bias, TypeRef outTy,
unsigned_t kernel, unsigned_t stride,
unsigned_t pad, unsigned_t group,
unsigned_t dilation = 1);
ConvolutionNode *
createConv(llvm::StringRef name, NodeValue input, NodeValue filter,
NodeValue bias, TypeRef outTy, unsigned_t kernel,
unsigned_t stride, unsigned_t pad, unsigned_t group,
unsigned_t dilation = 1,
ConvolutionLayout layout = ConvolutionLayout::NHWC);
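
For illustration, creating a convolution that consumes NCHW tensors directly could look like the following sketch, assuming `F` is a `Function *` and `input`, `filter`, `bias`, and `outTy` already describe NCHW-ordered tensors:

```cpp
ConvolutionNode *CN =
    F->createConv("conv_nchw", input, filter, bias, outTy,
                  /*kernel=*/3, /*stride=*/1, /*pad=*/1, /*group=*/1,
                  /*dilation=*/1, ConvolutionLayout::NCHW);
```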

/// Creates a Convolution3DNode with the given \p name which convolves the 5D
/// \p input with \p filter and \p bias. \p kernels defines the size of the
@@ -405,8 +407,9 @@ class Function final : public Named {
/// cells should be added to the input during convolution. \p group defines
/// the number of groups the input and output channels should be divided into
/// and convolved separately.
/// NOTE: ChannelwiseQuantizedConvolutionNode does not yet have an
/// implementation so attempting to run a graph containing this node fails.
/// NOTE: ChannelwiseQuantizedConvolutionNode does
/// not yet have an implementation so attempting to run a graph containing
/// this node fails.
ChannelwiseQuantizedConvolutionNode *createChannelwiseQuantizedConv(
llvm::StringRef name, NodeValue input, Constant *filter, Constant *bias,
Constant *scales, Constant *offsets, TypeRef outTy,
@@ -419,25 +422,28 @@
MaxPoolNode *createMaxPool(llvm::StringRef name, NodeValue input,
llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads);
llvm::ArrayRef<unsigned_t> pads,
ConvolutionLayout layout = NHWC);

MaxPoolNode *createMaxPool(llvm::StringRef name, NodeValue input,
unsigned_t kernel, unsigned_t stride,
unsigned_t pad);
unsigned_t pad, ConvolutionLayout layout = NHWC);

AvgPoolNode *createAvgPool(llvm::StringRef name, NodeValue input,
llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads);
llvm::ArrayRef<unsigned_t> pads,
ConvolutionLayout layout = NHWC);

AvgPoolNode *createAvgPool(llvm::StringRef name, NodeValue input,
TypeRef outTy, llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads);
llvm::ArrayRef<unsigned_t> pads,
ConvolutionLayout layout = NHWC);

AvgPoolNode *createAvgPool(llvm::StringRef name, NodeValue input,
unsigned_t kernel, unsigned_t stride,
unsigned_t pad);
unsigned_t pad, ConvolutionLayout layout = NHWC);
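
The pool creators follow the same pattern. A short sketch with invented names, assuming `in` is an NCHW-ordered `NodeValue`:

```cpp
MaxPoolNode *MP = F->createMaxPool("pool_nchw", in, /*kernel=*/2,
                                   /*stride=*/2, /*pad=*/0, NCHW);
AvgPoolNode *AP = F->createAvgPool("avg_nchw", MP->getResult(), /*kernel=*/2,
                                   /*stride=*/2, /*pad=*/0, NCHW);
```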

/// Creates and \returns an AdaptiveAvgPool node with \p name, \p input, and
/// \p outTy. The AdaptiveAvgPoolNode will perform average pooling over the
@@ -1100,14 +1106,15 @@ class Function final : public Named {
/// defines the number of groups the input and output channels should be
/// divided into and convolved separately. \p dilation defines factor by
/// which gap between 2 neighboring kernel elements is expanded along each
/// axis.
/// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.
ConvolutionNode *createConv(PlaceholderBindings &bindings,
llvm::StringRef name, NodeValue input,
size_t outChannels,
llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads, unsigned_t group,
unsigned_t dilation = 1);
unsigned_t dilation = 1,
ConvolutionLayout layout = NHWC);

/// Creates a ConvolutionNode with the given \p name which convolves the 4D
/// \p input. \p kernel defines the size of the height and width dimensions of
@@ -1117,12 +1124,13 @@ class Function final : public Named {
/// defines the number of groups the input and output channels should be
/// divided into and convolved separately. \p dilation defines factor by
/// which gap between 2 neighboring kernel elements is expanded along each
/// axis.
/// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.
ConvolutionNode *createConv(PlaceholderBindings &bindings,
llvm::StringRef name, NodeValue input,
size_t outChannels, unsigned_t kernel,
unsigned_t stride, unsigned_t pad,
unsigned_t group, unsigned_t dilation = 1);
unsigned_t group, unsigned_t dilation = 1,
ConvolutionLayout layout = NHWC);

/// Creates a Convolution3DNode with the given \p name which convolves the 5D
/// \p input. \p kernels defines the size of the height, width, and depth
3 changes: 3 additions & 0 deletions include/glow/Graph/Nodes.h
@@ -203,6 +203,9 @@ inline ShapeHWD calculate3DConvPoolOutputDims(
/// Modes of the padding operation.
enum PaddingMode { CONSTANT = 0, REFLECT, EDGE };

/// Convolution Layouts.
enum ConvolutionLayout { NHWC = 0, NCHW };
Contributor: Since this is applicable to pools as well, I could see this as a TensorLayout. A slight downside in that case is that TensorLayout might be too generic and overloaded a term.

Contributor (Author): I think TensorLayout is too general, but can't think of a better name :/

/// Support for hashing the Nodes. This is required for using
/// llvm::hash_combine.
class Node;
11 changes: 7 additions & 4 deletions include/glow/IR/IRBuilder.h
@@ -52,13 +52,16 @@ class IRBuilder {
/// @name High-level, operation-level IRBuilder.
///@{

MaxPoolWithArgmaxInst *createMaxPoolWithArgmaxOp(
llvm::StringRef name, Value *input, llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides, llvm::ArrayRef<unsigned_t> pads);
MaxPoolWithArgmaxInst *
createMaxPoolWithArgmaxOp(llvm::StringRef name, Value *input,
llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads, unsigned_t layout);

AvgPoolInst *createAvgPoolOp(Value *input, llvm::ArrayRef<unsigned_t> kernels,
llvm::ArrayRef<unsigned_t> strides,
llvm::ArrayRef<unsigned_t> pads);
llvm::ArrayRef<unsigned_t> pads,
unsigned_t layout);
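
A sketch of how IR generation could forward a node's layout to these builders; note that the layout travels as a plain `unsigned_t` at the IR level. The `valueForNode()` helper and the node-side `getLayout()` accessor are assumptions for the example:

```cpp
// Inside IR generation for a MaxPoolNode MP (illustrative):
auto *inVal = valueForNode(MP->getInput());
auto *poolInst = builder.createMaxPoolWithArgmaxOp(
    "maxpool", inVal, MP->getKernels(), MP->getStrides(), MP->getPads(),
    static_cast<unsigned_t>(MP->getLayout()));
```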

CrossEntropyLossInst *createCrossEntropyLossOp(llvm::StringRef name, Value *P,
Value *labels);
7 changes: 7 additions & 0 deletions lib/Backends/Interpreter/InterpreterNodes.cpp
@@ -282,6 +282,8 @@ void BoundInterpreterFunction::fwdConvolutionInstQuantizedImpl(
}

void BoundInterpreterFunction::fwdConvolutionInst(const ConvolutionInst *I) {
assert(I->getLayout() == NHWC &&
"Glow Interpreter supports only NHWC Convolutions");
auto kernelSizes = I->getKernels();
auto pads = I->getPads();
auto strides = I->getStrides();
@@ -303,6 +305,8 @@ void BoundInterpreterFunction::fwdConvolutionInst(const ConvolutionInst *I) {

void BoundInterpreterFunction::fwdConvolutionGradInst(
const ConvolutionGradInst *I) {
assert(I->getLayout() == NHWC &&
"Glow Interpreter supports only NHWC Convolutions");
auto inW = getWeightHandle(I->getSrc());
auto inG = getWeightHandle(I->getSrcGrad());
auto outG = getWeightHandle(I->getDestGrad());
@@ -753,6 +757,7 @@ static void fwdMaxPool(Tensor *inW, Tensor *outW, Tensor *argmaxW,
}

void BoundInterpreterFunction::fwdMaxPoolInst(const MaxPoolInst *I) {
assert(I->getLayout() == NHWC && "Glow Interpreter supports only NHWC Pools");
Contributor: same checks for gradient version?

auto inW = getTensor(I->getSrc());
auto outW = getTensor(I->getDest());

@@ -770,6 +775,7 @@ void BoundInterpreterFunction::fwdMaxPoolInst(const MaxPoolInst *I) {

void BoundInterpreterFunction::fwdMaxPoolWithArgmaxInst(
const MaxPoolWithArgmaxInst *I) {
assert(I->getLayout() == NHWC && "Glow Interpreter supports only NHWC Pools");
auto inW = getTensor(I->getSrc());
auto outW = getTensor(I->getDest());
auto argmaxW = getTensor(I->getArgmax());
@@ -888,6 +894,7 @@ void BoundInterpreterFunction::fwdAvgPoolInstI8Impl(const AvgPoolInst *I) {
}

void BoundInterpreterFunction::fwdAvgPoolInst(const AvgPoolInst *I) {
assert(I->getLayout() == NHWC && "Glow Interpreter supports only NHWC Pools");
if (I->getSrc()->getType()->isQuantizedType()) {
fwdAvgPoolInstI8Impl(I);
return;