
Commit ec27f90

Fix allgather to be compatible with the OpenXLA allgather tuple change (no token in the result tuple)
1 parent 3e97aa8 commit ec27f90

4 files changed: 37 additions (+), 23 deletions (-)

torch_xla/core/xla_model.py
Lines changed: 2 additions & 2 deletions

@@ -594,8 +594,8 @@ def all_gather(value, dim=0, groups=None, output=None, pin_layout=True):
     result = torch_xla._XLAC._xla_all_gather_coalesced(value, token, dim,
                                                        shard_count, groups or [],
                                                        pin_layout)
-    torch_xla._XLAC._set_all_reduce_token(devctx.device, result[-1])
-    return result[:-1]
+    torch_xla._XLAC._set_all_reduce_token(devctx.device, result[1])
+    return result[0]
 
 
 def all_to_all(value,
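
The indexing change above follows from the new C++ return contract: BuildAllGather now returns the gathered values and the chained token as separate fields instead of a token-terminated vector, and the lowering re-appends the token as the last node output, so for the single-input case the bound result reads as result[0] (value) and result[1] (token). A minimal hedged sketch of the two contracts, using Op as a stand-in for xla::XlaOp rather than the real torch_xla builders:

// Sketch only: Op stands in for xla::XlaOp; these are not the real
// torch_xla builders.
#include <vector>

using Op = int;

// Old contract: the chained token rides along as the last vector element,
// so Python sliced it off with result[:-1] and read it via result[-1].
std::vector<Op> BuildAllGatherOldStyle(const std::vector<Op>& inputs,
                                       Op token) {
  std::vector<Op> out = inputs;
  out.push_back(token);
  return out;
}

// New contract: payload and token are separate named fields; the lowering
// appends the token last, giving result[0] / result[1] for a single input.
struct AllGatherResultSketch {
  std::vector<Op> result;
  Op token;
};

AllGatherResultSketch BuildAllGatherNewStyle(const std::vector<Op>& inputs,
                                             Op token) {
  return {inputs, token};
}

int main() {
  AllGatherResultSketch r = BuildAllGatherNewStyle({1, 2}, 99);
  return r.result.size() == 2 && r.token == 99 ? 0 : 1;
}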

torch_xla/csrc/cross_replica_reduces.cpp
Lines changed: 11 additions & 15 deletions

@@ -210,23 +210,18 @@ AllToAllResult BuildAllToAll(xla::XlaOp input, xla::XlaOp token,
   return {reduce_result, token_handler.GetNewToken(reduce_result)};
 }
 
-std::vector<xla::XlaOp> BuildAllGather(
+AllGatherResult BuildAllGather(
     absl::Span<const xla::XlaOp> inputs, xla::XlaOp token, int64_t dim,
     int64_t shard_count, const std::vector<std::vector<int64_t>>& groups,
     bool pin_layout) {
   std::vector<xla::ReplicaGroup> cc_groups = CreateReduceGroups(groups);
+  TokenHandler token_handler(token);
   // TODO: We use pseudo-tokens ATM, which are real values. This need to be
   // switched to use the real XLA Token once support has been added to XLA
   // AllGather().
-  xla::XlaOp chained_token = token;
   ReduceContext cc_ctx = GetReduceContext(inputs);
   std::vector<xla::XlaOp> result(inputs.size());
   for (auto& type_ctx : cc_ctx.contexts) {
-    xla::XlaOp token_op = MaybeConvertTo(chained_token, type_ctx.first);
-    type_ctx.second.ops.push_back(token_op);
-    type_ctx.second.operand_shapes.push_back(
-        ShapeHelper::ShapeOfXlaOp(token_op));
-
     xla::XlaOp all_gather_result;
     if (pin_layout) {
       all_gather_result = xla::AllGather(
@@ -239,16 +234,17 @@ std::vector<xla::XlaOp> BuildAllGather(
           xla::AllGather(xla::Tuple(inputs[0].builder(), type_ctx.second.ops),
                          dim, shard_count, cc_groups);
     }
-    for (size_t i = 0; i < type_ctx.second.indices.size(); ++i) {
-      size_t op_idx = type_ctx.second.indices[i];
-      result[op_idx] = xla::GetTupleElement(all_gather_result, i);
+    if (type_ctx.second.indices.size() > 1) {
+      for (size_t i = 0; i < type_ctx.second.indices.size(); ++i) {
+        size_t op_idx = type_ctx.second.indices[i];
+        result[op_idx] = xla::GetTupleElement(all_gather_result, i);
+      }
+    }
+    else {
+      result[0] = all_gather_result;
     }
-    chained_token =
-        xla::GetTupleElement(all_gather_result, type_ctx.second.indices.size());
   }
-  result.push_back(
-      MaybeConvertTo(chained_token, XlaHelpers::TypeOfXlaOp(token)));
-  return result;
+  return {result, token_handler.GetNewToken(result[0])};
 }
 
 CollectivePermuteResult BuildCollectivePermute(
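
The new indices.size() > 1 branch is the heart of the compatibility fix: with the pseudo-token no longer appended as an extra operand, a single-operand xla::AllGather returns the gathered array directly rather than a one-element tuple, so xla::GetTupleElement is only legal in the multi-operand case. A hedged, self-contained sketch of that dispatch, with stub types standing in for the real XLA builder API:

// Sketch only: FakeOp stands in for xla::XlaOp, FakeGetTupleElement for
// xla::GetTupleElement; not the real XLA API.
#include <cassert>
#include <cstddef>
#include <vector>

struct FakeOp {
  bool is_tuple;            // whether the op has tuple shape
  std::vector<int> elems;   // gathered values
};

// Only valid on tuple-shaped ops, mirroring xla::GetTupleElement.
int FakeGetTupleElement(const FakeOp& op, size_t i) {
  assert(op.is_tuple && "GetTupleElement on a non-tuple op");
  return op.elems[i];
}

// Mirrors the post-change AllGather behavior this commit adapts to:
// a tuple result only appears when more than one operand is gathered.
FakeOp FakeAllGather(const std::vector<int>& operands) {
  return {operands.size() > 1, operands};
}

std::vector<int> UnpackGather(const FakeOp& gathered, size_t n_operands) {
  std::vector<int> result(n_operands);
  if (n_operands > 1) {
    for (size_t i = 0; i < n_operands; ++i) {
      result[i] = FakeGetTupleElement(gathered, i);  // multi-operand: tuple
    }
  } else {
    result[0] = gathered.elems[0];  // single operand: plain array, no tuple
  }
  return result;
}

int main() {
  FakeOp multi = FakeAllGather({1, 2, 3});
  FakeOp single = FakeAllGather({7});
  std::vector<int> a = UnpackGather(multi, 3);   // tuple path
  std::vector<int> b = UnpackGather(single, 1);  // direct-array path
  return a[2] == 3 && b[0] == 7 ? 0 : 1;
}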

torch_xla/csrc/cross_replica_reduces.h
Lines changed: 12 additions & 1 deletion

@@ -25,6 +25,11 @@ struct AllToAllResult {
   xla::XlaOp token;
 };
 
+struct AllGatherResult {
+  std::vector<xla::XlaOp> result;
+  xla::XlaOp token;
+};
+
 struct CollectivePermuteResult {
   xla::XlaOp result;
   xla::XlaOp token;
@@ -40,6 +45,11 @@ struct RecvResult {
   xla::XlaOp token;
 };
 
+struct ReduceScatterResult {
+  std::vector<xla::XlaOp> result;
+  xla::XlaOp token;
+};
+
 std::vector<xla::XlaOp> BuildAllReduce(
     AllReduceType reduce_type, absl::Span<const xla::XlaOp> operands,
     xla::XlaOp token, double scale,
@@ -51,7 +61,7 @@ AllToAllResult BuildAllToAll(xla::XlaOp input, xla::XlaOp token,
                              const std::vector<std::vector<int64_t>>& groups,
                              bool pin_layout);
 
-std::vector<xla::XlaOp> BuildAllGather(
+AllGatherResult BuildAllGather(
     absl::Span<const xla::XlaOp>, xla::XlaOp token, int64_t dim,
     int64_t shard_count, const std::vector<std::vector<int64_t>>& groups,
     bool pin_layout);
@@ -66,6 +76,7 @@ SendResult BuildSendWithToken(xla::XlaOp input, xla::XlaOp token,
 RecvResult BuildRecvWithToken(xla::XlaOp token, const xla::Shape& recv_shape,
                               int64_t channel_id);
 
+//ReduceScatterResult BuildReduceScatter(
 std::vector<xla::XlaOp> BuildReduceScatter(
     AllReduceType reduce_type, absl::Span<const xla::XlaOp> inputs,
     xla::XlaOp token, double scale, int64_t scatter_dim, int64_t shard_count,
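
AllGatherResult follows the existing AllToAllResult / CollectivePermuteResult convention of pairing the payload with the chained token, and ReduceScatterResult is declared ahead of the analogous reduce-scatter migration (the commented-out BuildReduceScatter signature marks the pending change). A hedged sketch of consuming such a struct at a call site, with a stub Op type instead of xla::XlaOp:

// Sketch only: Op stands in for xla::XlaOp; not the real headers.
#include <cstdio>
#include <vector>

struct Op { int id; };

// Same shape as the new AllGatherResult declaration above.
struct GatherResult {
  std::vector<Op> result;
  Op token;
};

// Toy builder: derives a fresh token from the incoming one.
GatherResult BuildGatherSketch(const std::vector<Op>& inputs, Op token) {
  return {inputs, Op{token.id + 1}};
}

int main() {
  // C++17 structured bindings name both halves, so the token can no longer
  // be confused with (or lost among) the payload elements.
  auto [values, new_token] = BuildGatherSketch({Op{1}, Op{2}}, Op{10});
  std::printf("values=%zu new_token=%d\n", values.size(), new_token.id);
  return 0;
}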

torch_xla/csrc/ops/all_gather.cpp
Lines changed: 12 additions & 5 deletions

@@ -16,17 +16,23 @@ xla::Shape NodeOutputShape(c10::ArrayRef<torch::lazy::Value> inputs,
                            const std::vector<std::vector<int64_t>>& groups,
                            bool pin_layout) {
   auto shape_fn = [&](absl::Span<const xla::XlaOp> operands) -> xla::XlaOp {
-    std::vector<xla::XlaOp> result =
+    AllGatherResult result =
         BuildAllGather(operands.subspan(0, operands.size() - 1),
                        operands.back(), dim, shard_count, groups, pin_layout);
-    return xla::Tuple(operands[0].builder(), result);
+    std::vector<xla::XlaOp> outputs;
+    for (size_t i = 0; i < result.result.size(); ++i) {
+      outputs.emplace_back(result.result[i]);
+    }
+    outputs.emplace_back(result.token);
+    return xla::Tuple(operands[0].builder(), outputs);
   };
   std::vector<xla::Shape> input_shapes;
   for (const auto& input : inputs) {
     input_shapes.emplace_back(GetXlaShape(input));
   }
   input_shapes.emplace_back(GetXlaShape(token));
   return InferOutputShape(input_shapes, shape_fn);
+
 }
 
 }  // namespace
@@ -61,9 +67,10 @@ XlaOpVector AllGather::Lower(LoweringContext* loctx) const {
     inputs.push_back(loctx->GetOutputOp(operand_list[i]));
   }
   xla::XlaOp token = loctx->GetOutputOp(operand_list.back());
-  return ReturnOps(
-      BuildAllGather(inputs, token, dim_, shard_count_, groups_, pin_layout_),
-      loctx);
+  AllGatherResult result =
+      BuildAllGather(inputs, token, dim_, shard_count_, groups_, pin_layout_);
+  result.result.push_back(result.token);
+  return ReturnOps(result.result, loctx);
 }
 
 std::string AllGather::ToString() const {
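
Both call sites above re-flatten the struct: the shape-inference lambda rebuilds the output tuple with the token last, and Lower() appends the token before ReturnOps, so the IR node keeps its N values + 1 token output layout and the Python side can read result[0] / result[1]. A minimal hedged sketch of that flattening, with a stub Op type rather than the real lowering types:

// Sketch only: Op stands in for xla::XlaOp.
#include <utility>
#include <vector>

struct Op { int id; };

struct GatherResult {
  std::vector<Op> result;
  Op token;
};

// Flatten {values, token} back into the node's historical output list:
// values first, token as the final element.
std::vector<Op> FlattenForReturnOps(GatherResult r) {
  std::vector<Op> outputs = std::move(r.result);
  outputs.push_back(r.token);
  return outputs;
}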
