JIT: Eliminate SumToSize by using Optional Lists #18697

Open · wants to merge 26 commits into master
Changes from 1 commit (26 commits total)
4ecb33a
Specialize Optional (Tensor) to None when executing graph
t-vi Mar 24, 2019
431edca
don't unpack NoneType in _unwrap_optional
t-vi Mar 24, 2019
f557aeb
fix worst stuff
t-vi Mar 24, 2019
c032d5a
Merge branch 'master' into optional_None
t-vi Mar 25, 2019
60a3c93
Merge branch 'master' into optional_None
t-vi Apr 1, 2019
131739f
[WIP] JIT: Eliminate SumToSize by using Optional Lists
t-vi Apr 1, 2019
bf3de65
no dump()
t-vi Apr 1, 2019
3b4acf9
Fixes
t-vi Apr 2, 2019
e18cf4f
incorporate some feedback from eellison, thanks
t-vi Apr 2, 2019
6650d55
Merge branch 'master' into optional_None
t-vi Apr 3, 2019
9cd8ac9
clean up specialize_autogradzero
t-vi Apr 3, 2019
c43e367
add test for optional list
t-vi Apr 4, 2019
00edd17
Merge branch 'optional_None' into optional_list_sts
t-vi Apr 4, 2019
f2bc16a
change size_if_not_equal to take sizes
t-vi Apr 4, 2019
c21b407
this==other is much more interesting than this==this and works better…
t-vi Apr 5, 2019
9a808fa
Merge branch 'master' into optional_list_sts
t-vi Apr 5, 2019
83b5a74
fixes
t-vi Apr 5, 2019
fb45987
Merge branch 'master' into optional_list_sts
t-vi Apr 6, 2019
7f3f1c6
Merge branch 'master' into optional_list_sts
t-vi May 7, 2019
b518e61
Merge remote-tracking branch 'origin/master' into HEAD
May 9, 2019
2042e22
clang format my changes
t-vi May 9, 2019
b7000b0
update TestFuser
t-vi May 9, 2019
5967ad0
Merge branch 'master' into optional_list_sts
t-vi May 9, 2019
2e6e165
add test
t-vi May 9, 2019
fbb2a7f
only one forward multiple backwards to sharpen test
t-vi May 10, 2019
13a6932
formatting typo
t-vi May 10, 2019
+121 −34
[WIP] JIT: Eliminate SumToSize by using Optional Lists

This PR is a proposed alternative to #18120 and would achieve
very similar fusion (in particular for LSTM backward).

It consists of three parts:
- Specialize non-Tensor Optional graph inputs to either NoneType
  or the element type.
  This requires the graph spec to differ between the two cases.
- In AutoDiff, record broadcasting sizes only if the broadcast
  output size differs from the input size; otherwise record None.
- The specialization then allows us to eliminate
  _grad_sum_to_size(t, None) in the peephole optimization
  step.

Thus, in the LSTM case, no SumToSize ops remain in the crucial fusion
group. The trick here is that we can specialize on the runtime
information from the forward pass.

I label this WIP because I haven't integrated tests yet and because
I haven't moved all symbolic_script _grad_sum_to_size uses to the new
logic.

However, it would be great to have some discussion, given that some
implementation details may raise eyebrows.
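For orientation, here is a minimal pure-Python sketch of the intended semantics of the two helpers this PR revolves around. It is only an illustration (the actual implementations are the C++ operators further down in the diff); Tensor.sum_to_size stands in for at::sum_to, and the shapes are made up for the example:

```python
import torch

def _size_if_not_equal(t, result):
    # Record t's size only when broadcasting changed the shape; otherwise
    # return None so the matching backward reduction can be eliminated.
    return list(t.size()) if t.size() != result.size() else None

def _grad_sum_to_size(grad, size):
    # size=None means "no broadcasting happened": this is the identity,
    # and the peephole pass can drop the call entirely.
    return grad if size is None else grad.sum_to_size(*size)

# Example: only b is broadcast, so only its gradient needs a reduction.
a = torch.randn(4, 3)
b = torch.randn(1, 3)
result = a * b

a_size = _size_if_not_equal(a, result)   # None
b_size = _size_if_not_equal(b, result)   # [1, 3]

grad_out = torch.ones_like(result)
grad_a = _grad_sum_to_size(grad_out * b, a_size)   # identity, shape (4, 3)
grad_b = _grad_sum_to_size(grad_out * a, b_size)   # reduced to shape (1, 3)
```

With None recorded in the common non-broadcast case, the peephole pass can strip these calls, which is what keeps them out of the LSTM backward fusion group.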
t-vi committed Apr 1, 2019
commit 131739fb3e29303da1283aa51d68c17fff81b5f3
@@ -75,6 +75,7 @@ namespace c10 {
_(prim, max) \
_(prim, rangelist) \
_(aten, _grad_sum_to_size) \
_(aten, _size_if_not_equal) \
_(aten, _ncf_unsqueeze) \
_(aten, warn) \
_(aten, floordiv) \
@@ -43,7 +43,7 @@ struct ArgumentInfo {
return at::ScalarType(type_);
}
operator TypePtr() const {
if (!defined())
if (!defined() && isTensor())
return TensorType::get();
return DimensionedTensorType::create(type(), ConvertIntToCPUOrCUDA(device()), dim());
}
@@ -106,6 +106,9 @@ struct ArgumentSpec {
} else {
// NB: no need to set is_tensor to false, because we memset the struct to
// 0 above
if (! input.isNone()) {
arg.defined_ = true;
}
combineHash(arg);
offset++;
}
@@ -166,6 +169,17 @@ struct ArgumentSpec {
return TupleType::create(fmap(
tuple_type->elements(),
[&](const TypePtr& subtype) { return fillType(subtype, offset); }));
} else if (auto optional_type = original->cast<OptionalType>()) {
// FIXME: let's leave Tensor? untouched, would be obsoleted by #18407
if (original->isSubtypeOf(OptionalType::ofTensor())) {
offset++;
return original;
}
auto& arg = args.at(offset++);
if (! arg.defined()) {
return NoneType::get();
}
return optional_type->getElementType();
} else {
offset++;
return original;
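As a reading aid, a hedged plain-Python paraphrase of the Optional branch added to fillType above (the type names are plain strings here, not real JIT types): a non-Tensor Optional input is narrowed to NoneType when the recorded argument was None, and to its element type otherwise, so the two cases yield different graph specializations.

```python
# Toy paraphrase of the new Optional case in ArgumentSpec::fillType;
# arg_defined mirrors ArgumentInfo::defined(): False means the input was None.
def specialize_optional(element_type: str, arg_defined: bool) -> str:
    return element_type if arg_defined else "NoneType"

assert specialize_optional("int[]", arg_defined=False) == "NoneType"
assert specialize_optional("int[]", arg_defined=True) == "int[]"
```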
@@ -241,6 +255,8 @@ struct CompleteArgumentSpec {
std::copy(strides.begin(), strides.end(), next_dim);
next_dim += strides.size();
}
} else {
pod.defined = !inputs[i].isNone();
}
// each POD has a running tally of all dimensions including its own
pod.total_dims = total_dims;
@@ -325,7 +341,7 @@ struct CompleteArgumentInfo {
spec.sizes_strides() + sizes_strides_offset(i) + ndim, ndim);
}
operator TypePtr() const {
if (!defined())
if (!defined() && isTensor())
return TensorType::get();
return CompleteTensorType::create(
type(), ConvertIntToCPUOrCUDA(device()), sizes(), strides());
@@ -109,7 +109,8 @@ bool isDifferentiable(Node* n) {
"aten::sinh(Tensor self) -> Tensor",
"aten::tan(Tensor self) -> Tensor",
"aten::trunc(Tensor self) -> Tensor",
"aten::_grad_sum_to_size(Tensor(a) self, int[] size) -> Tensor(a)",
"aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)",
"aten::_size_if_not_equal(Tensor self, Tensor other) -> int[]?",
"aten::log_softmax(Tensor self, int dim) -> Tensor",
"aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, bool ceil_mode, bool count_include_pad) -> Tensor",
"aten::max_pool2d_with_indices(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> (Tensor, Tensor)",
@@ -270,11 +271,11 @@ class GradientHelper {
private:
Node* node;

SymbolicVariable gradSumToSizeOf(SymbolicVariable v, Symbol input_name) {
SymbolicVariable gradSumToSizeOf(SymbolicVariable v, Symbol input_name, SymbolicVariable fw_output) {
Value* size;
{
WithInsertPoint insert_guard{node};
size = SymbolicVariable(node->namedInput(input_name)).size();
WithInsertPoint insert_guard{node->next()};

@wanchaol (Contributor), May 22, 2019:
Why did this guard change from node to node->next()?

size = SymbolicVariable(node->namedInput(input_name)).size_if_not_equal(fw_output);
}
return v.gradSumToSize(size);
};
@@ -300,9 +301,9 @@ class GradientHelper {

if (node->matches(
"aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor")) {
return {gradSumToSizeOf(grads.at(0), attr::self),
return {gradSumToSizeOf(grads.at(0), attr::self, outputs.at(0)),
gradSumToSizeOf(
grads.at(0) * node->namedInput(attr::alpha), attr::other),
grads.at(0) * node->namedInput(attr::alpha), attr::other, outputs.at(0)),
nullptr};

} else if (
@@ -317,9 +318,9 @@ class GradientHelper {
} else if (
node->matches(
"aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor")) {
return {gradSumToSizeOf(grads.at(0), attr::self),
return {gradSumToSizeOf(grads.at(0), attr::self, outputs.at(0)),
gradSumToSizeOf(
-grads.at(0) * node->namedInput(attr::alpha), attr::other),
-grads.at(0) * node->namedInput(attr::alpha), attr::other, outputs.at(0)),
nullptr};

} else if (
@@ -329,19 +330,19 @@ class GradientHelper {

} else if (node->matches(
"aten::mul(Tensor self, Tensor other) -> Tensor")) {
return {gradSumToSizeOf(grads.at(0) * inputs.at(1), attr::self),
gradSumToSizeOf(grads.at(0) * inputs.at(0), attr::other)};
return {gradSumToSizeOf(grads.at(0) * inputs.at(1), attr::self, outputs.at(0)),
gradSumToSizeOf(grads.at(0) * inputs.at(0), attr::other, outputs.at(0))};

} else if (node->matches(
"aten::mul(Tensor self, Scalar other) -> Tensor")) {
return {grads.at(0) * inputs.at(1), nullptr};

} else if (node->matches(
"aten::div(Tensor self, Tensor other) -> Tensor")) {
return {gradSumToSizeOf(grads.at(0) / inputs.at(1), attr::self),
return {gradSumToSizeOf(grads.at(0) / inputs.at(1), attr::self, outputs.at(0)),
gradSumToSizeOf(
-grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1)),
attr::other)};
attr::other, outputs.at(0))};

} else if (node->matches(
"aten::div(Tensor self, Scalar other) -> Tensor")) {
@@ -352,30 +353,30 @@ class GradientHelper {
return {
gradSumToSizeOf(
grads.at(0) * (inputs.at(0) > inputs.at(1)).type_as(grads.at(0)),
attr::self),
attr::self, outputs.at(0)),
gradSumToSizeOf(
grads.at(0) * (inputs.at(1) > inputs.at(0)).type_as(grads.at(0)),
attr::other)};
attr::other, outputs.at(0))};

} else if (node->matches(
"aten::min(Tensor self, Tensor other) -> Tensor")) {
return {
gradSumToSizeOf(
grads.at(0) * (inputs.at(0) < inputs.at(1)).type_as(grads.at(0)),
attr::self),
attr::self, outputs.at(0)),
gradSumToSizeOf(
grads.at(0) * (inputs.at(1) < inputs.at(0)).type_as(grads.at(0)),
attr::other)};
attr::other, outputs.at(0))};

} else if (
node->matches(
"aten::where(Tensor condition, Tensor self, Tensor other) -> Tensor")) {
return {nullptr,
gradSumToSizeOf(
grads.at(0) * inputs.at(0).type_as(grads.at(0)), attr::self),
grads.at(0) * inputs.at(0).type_as(grads.at(0)), attr::self, outputs.at(0)),
gradSumToSizeOf(
grads.at(0) * (1 - inputs.at(0)).type_as(grads.at(0)),
attr::other)};
attr::other, outputs.at(0))};

} else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) {
// TODO: The order of operations matter in this case. This
@@ -462,14 +463,19 @@ class GradientHelper {

} else if (
node->matches(
"aten::_grad_sum_to_size(Tensor(a) self, int[] size) -> Tensor(a)")) {
"aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)")) {
Value* self_size;
{
WithInsertPoint insert_guard{node};
self_size = inputs.at(0).size();
}
return {grads.at(0).expand(self_size), nullptr};

} else if (
node->matches(
"aten::_size_if_not_equal(Tensor self, Tensor other) -> int[]?")) {
return {nullptr, nullptr};

} else if (node->matches("aten::ceil(Tensor self) -> Tensor")) {
return {SymbolicVariable::zeros_like(grads.at(0))};

@@ -561,7 +567,7 @@ class GradientHelper {
node->matches(
"aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta, Scalar alpha) -> Tensor")) {
return {gradSumToSizeOf(
grads.at(0) * node->namedInput(attr::beta), attr::self),
grads.at(0) * node->namedInput(attr::beta), attr::self, outputs.at(0)),
grads.at(0).mm(inputs.at(2).t()) * node->namedInput(attr::alpha),
inputs.at(1).t().mm(grads.at(0)) * node->namedInput(attr::alpha),
nullptr,
@@ -457,10 +457,20 @@ bool Operator::matches(const Node* node) const {
const MatchTypeReturn matched_type =
matchTypeVariables(formals[i].type(), actuals[i]->type(), type_env);
if (!matched_type.type) {
if (actuals[i]->type() == NoneType::get() &&
formals[i].type()->kind() == TypeKind::OptionalType) {
// when looking for a match, None is actually OK here
continue;
}
return false;
}
TypePtr formal = *matched_type.type;
if (!actuals[i]->type()->isSubtypeOf(formal)) {
if (actuals[i]->type() == NoneType::get() &&
formals[i].type()->kind() == TypeKind::OptionalType) {
// when looking for a match, None is actually OK here
continue;
}
return false;
}
}
@@ -388,7 +388,8 @@ struct GraphFuser {
} else if (
(input->type()->isSubtypeOf(FloatType::get()) && input->node()->kind() != prim::Constant) ||
(n->kind() == aten::_grad_sum_to_size &&
input->type()->isSubtypeOf(ListType::ofInts()))) {
(input->type()->isSubtypeOf(ListType::ofInts()) ||
input->type()->isSubtypeOf(NoneType::get())))) {
auto in_group = subgraph.addInput();
in_group->setType(input->type());
inputs_map[input] = in_group;
@@ -398,6 +399,7 @@ struct GraphFuser {
// so we generally don't allow fusing tensor-scalar operations unless
// the scalar is constant. In those cases we inline the constants
// directly in the body of the fused group.
input->node()->dump();
AT_ASSERT(input->node()->kind() == prim::Constant);
Node* in_const =
subgraph.createClone(input->node(), [](Value*) -> Value* {
@@ -61,6 +61,11 @@ void PeepholeOptimizeImpl(Block* block, bool addmm_fusion_enabled) {
self_type->device() == other_type->device()) {
node->output()->replaceAllUsesWith(node->input(0));
}
} else if (node->matches(
"aten::_grad_sum_to_size(Tensor self, int[]? size) -> Tensor")) {
if (node->input(1)->type()->isSubtypeOf(NoneType::get())) {
node->output()->replaceAllUsesWith(node->input(0));
}
} else if (
node->matches(
"aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor",
@@ -157,11 +162,11 @@ void PeepholeOptimizeImpl(Block* block, bool addmm_fusion_enabled) {
}
} else if (
node->matches(
"aten::_grad_sum_to_size(Tensor(a) self, int[] size) -> Tensor(a)")) {
"aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)")) {

@wanchaol (Contributor), May 22, 2019:
Can you just merge the _grad_sum_to_size case above with this one?

auto uses = node->output()->uses();
for (Use u : uses) {
if (u.user->matches(
"aten::_grad_sum_to_size(Tensor(a) self, int[] size) -> Tensor(a)")) {
"aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)")) {
u.user->replaceInput(0, node->inputs().at(0));
}
}
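To make the intent of the new peephole case concrete, here is a toy sketch of the rewrite it performs: when the size argument is statically None, _grad_sum_to_size is the identity, so its output can be replaced by its tensor input. This uses a hypothetical mini-IR, not the real torch::jit classes; the statically-None size is modeled simply as the Python value None.

```python
class ToyNode:
    def __init__(self, kind, inputs):
        self.kind = kind
        self.inputs = inputs          # producing ToyNodes, or None for a None size

def eliminate_trivial_sum_to_size(nodes):
    replacement = {}                  # node -> value that replaces its output
    kept = []
    for n in nodes:
        # Route inputs through any replacements made so far.
        n.inputs = [replacement.get(i, i) for i in n.inputs]
        if n.kind == "aten::_grad_sum_to_size" and n.inputs[1] is None:
            replacement[n] = n.inputs[0]   # output is just the first input
        else:
            kept.append(n)
    return kept

# _grad_sum_to_size(t, None) disappears; the mul consumes t directly.
t = ToyNode("prim::Param", [])
sts = ToyNode("aten::_grad_sum_to_size", [t, None])
use = ToyNode("aten::mul", [sts, t])
graph = eliminate_trivial_sum_to_size([t, sts, use])
assert use.inputs[0] is t and sts not in graph
```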
@@ -513,6 +513,17 @@ class ShapePropagator {
}
return;
}
case prim::unchecked_unwrap_optional: {
// If we have None as input, we need to leave the output type alone
// note that we cannot have None as input in execution,
// but we want to keep the graph consistent during the analysis
if (auto ot = node->input()->type()->cast<OptionalType>()) {
node->output()->setType(ot->getElementType());
} else if (!node->input()->type()->isSubtypeOf(NoneType::get())) {
node->output()->setType(node->input()->type());
}
return;
}
case prim::ConstantChunk: {
Value* tensor = node->input();
if (auto type = tensor->type()->cast<DimensionedTensorType>()) {
@@ -529,10 +540,20 @@ class ShapePropagator {
return;
}
case aten::_unwrap_optional: {
// if we have None as input, we need to leave the output alone
auto input_ivalue = toIValue(node->input());
if (input_ivalue && input_ivalue->isNone()) {
return;
}
// During analysis we don't want to pass None through here,
// so as not to break the expectations of the passes that
// consume this node's output.
if (auto ot = node->input()->type()->cast<OptionalType>()) {
node->output()->setType(ot->getElementType());
} else if (!node->input()->type()->isSubtypeOf(NoneType::get())) {
node->output()->setType(node->input()->type());
}
return;
}
default:
break; // fall-through
@@ -583,12 +583,28 @@ RegisterOperators reg(
};
}),
Operator(
"aten::_grad_sum_to_size(Tensor(a) self, int[] size) -> Tensor(a)",
"aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)",
[](Stack& stack) {
at::Tensor self;
Shared<IntList> desired_sizes;
pop(stack, self, desired_sizes);
push(stack, at::sum_to(std::move(self), desired_sizes->elements()));
IValue self, size;
pop(stack, self, size);
if (size.isNone()) {
push(stack, self);
} else {
push(stack, at::sum_to(self.toTensor(), size.toIntList()->elements()));
}
return 0;
}),
Operator(
"aten::_size_if_not_equal(Tensor self, Tensor other) -> int[]?",
[](Stack& stack) {
at::Tensor self, other;
pop(stack, self, other);
const auto s = self.sizes();
if (s == other.sizes()) {
push(stack, IValue());
} else {
push(stack, s);
}
return 0;
}),
Operator(
@@ -470,14 +470,18 @@ const std::vector<std::string> functions = {
return torch.lerp(self, end, weight), backward
def mul(self, other):
result = self * other
self_size = torch._size_if_not_equal(self, result)
other_size = torch._size_if_not_equal(other, result)
def backward(grad_output):
# self & other are used in backward. No need to pass in their size
# from forward pass
grad_self = (grad_output * other)._grad_sum_to_size(self.size())
grad_other = (grad_output * self)._grad_sum_to_size(other.size())
grad_self = (grad_output * other)._grad_sum_to_size(self_size)
grad_other = (grad_output * self)._grad_sum_to_size(other_size)
return grad_self, grad_other
return self * other, backward
return result, backward
def mv(self, vec):
def backward(grad_output):
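As a sanity check on the rewritten mul backward above, the following eager-mode snippet (plain PyTorch, independent of the JIT) confirms that summing the broadcast gradient back to the input's size reproduces autograd's result; the shapes are illustrative.

```python
import torch

a = torch.randn(5, 4, requires_grad=True)
b = torch.randn(4, requires_grad=True)        # broadcast over dim 0
out = a * b
out.backward(torch.ones_like(out))

# grad_other = (grad_output * self) summed back to other's size, as in the
# symbolic_script backward above.
grad_b_manual = (torch.ones(5, 4) * a.detach()).sum_to_size(*b.shape)
print(torch.allclose(b.grad, grad_b_manual))  # True
```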
@@ -871,7 +875,6 @@ const std::vector<std::string> functions = {
return grad_self, None, None, None, None
return torch.__interpolate(input, size, scale_factor, mode, align_corners), backward
)"};
std::unordered_map<std::string, GradientPair> schema_to_graphs;

@@ -174,6 +174,9 @@ struct SymbolicVariable {
return create(aten::type_as, {*this, rhs})[0].typeLikeWithRhsScalarType(
*this, rhs);
}
SymbolicVariable size_if_not_equal(const SymbolicVariable other) const {
return create(aten::_size_if_not_equal, {*this, other})[0].toType(OptionalType::create(ListType::ofInts()));
}
SymbolicVariable narrow(int dim, int64_t start, int64_t length) const {
return create(
t("narrow"),
@@ -306,6 +309,10 @@ struct SymbolicVariable {
v->setType(other_type->contiguous());
return *this;
}
SymbolicVariable toType(TypePtr type) const {
v->setType(type);
return *this;
}
SymbolicVariable typeLikeWithScalarType(
SymbolicVariable other,
at::ScalarType type) const {