
Commit 56e57f0

Merge remote-tracking branch 'origin' into benchmark
2 parents: a6bef63 + 57e6035

105 files changed: +7864 −2835 lines


README.md

Lines changed: 6 additions & 5 deletions
@@ -147,10 +147,11 @@ bucket.

 | Version | Cloud TPU VMs Wheel |
 | --- | ----------- |
-| 2.1 (CUDA 12.0 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
+| 2.1 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
 | 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` |
 | nightly (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
 | nightly (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl` |
+| nightly (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |

 <details>

@@ -230,11 +231,11 @@ This is only required on Cloud TPU VMs.

 <br/>

-| Version | GPU CUDA 12.0 Docker |
+| Version | GPU CUDA 12.1 Docker |
 | --- | ----------- |
-| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.0` |
-| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0` |
-| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0_YYYYMMDD` |
+| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.1` |
+| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1` |
+| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1_YYYYMMDD` |

 <br/>
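A quick note for readers (not part of the diff): one of the CUDA 12.1 wheels above can be sanity-checked with a short Python session. This is a minimal sketch; it assumes a PJRT runtime where `PJRT_DEVICE=CUDA` selects the GPU backend, which may differ on older releases.

```python
# Minimal smoke test after installing a CUDA wheel (illustrative; the
# PJRT_DEVICE value is an assumption and was "GPU" on some older builds).
import os
os.environ.setdefault("PJRT_DEVICE", "CUDA")

import torch
import torch_xla.core.xla_model as xm

t = torch.randn(2, 2, device=xm.xla_device())
print(t.device)  # expect something like "xla:0"
```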
codegen/lazy_tensor_generator.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def build_ir_node(self, func: NativeFunction, schema: LazyIrSchema) -> str:
         get_tensorlist="GetTensorList",
         get_tensor_or_wrap_number="bridge::GetXlaTensorOrCreateForWrappedNumber",
         try_get_tensor="bridge::TryGetXlaTensor",
-        metrics_counter='TORCH_LAZY_FN_COUNTER("xla::")',
+        metrics_counter='TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::")',
         create_tensor="XLATensor::Create",
         create_aten_from_ltc_tensor="torch_xla::bridge::AtenFromXlaTensor",
         tuple_aten_from_ltc_tensors="torch_xla::bridge::TupleAtenFromXlaTensors",
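
For context: this swaps the macro that the generated lowerings call for each traced op; the `_TIMED_TRACING` variant, as the name suggests, records timing/tracing alongside the per-op counter. The counters remain visible from Python. A hedged sketch, assuming the standard `torch_xla.debug.metrics` API:

```python
# Illustrative only: inspect the "xla::" counters that generated code
# increments via the metrics_counter macro configured above.
import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

y = torch.ones(2, 2, device=xm.xla_device()) * 3
xm.mark_step()  # force execution of the traced graph
print([c for c in met.counter_names() if c.startswith("xla::")])
```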

codegen/xla_native_functions.yaml

Lines changed: 1 addition & 0 deletions
@@ -192,6 +192,7 @@ supported:
 - floor_divide
 - fmod.Scalar
 - fmod.Tensor
+- full
 - gather
 - gelu
 - gelu_backward
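
Adding `full` to the `supported:` list tells the codegen to emit a native XLA lowering for `aten::full`. A hedged sketch of what this enables (illustrative, not from the commit; the fallback-counter check is an assumption about how unlowered ops surface in metrics):

```python
# With a native lowering for `full`, this tensor creation should stay on
# the XLA device rather than taking the aten::full fallback path.
import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

t = torch.full((2, 3), 1.5, device=xm.xla_device())
print(t.cpu())
print(met.counter_names())  # an "aten::full" fallback counter should be absent
```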

docs/gpu.md

Lines changed: 19 additions & 7 deletions
@@ -11,14 +11,14 @@ You can either use a local machine with GPU attached or a GPU VM on the cloud. F
 ### Docker
 Pytorch/XLA currently publish prebuilt docker images and wheels with cuda11.7/8 and python 3.8. We recommend users to create a docker container with corresponding config. For a full list of docker images and wheels, please refer to [this doc](https://github.com/pytorch/xla#available-docker-images-and-wheels).
 ```
-sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8
+sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1
 sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
 distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
 curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
-sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8 bin/bash
+sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1 bin/bash
 sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
 ```

@@ -49,10 +49,20 @@ Thu Dec 8 06:24:29 2022

 ```

+### Check environment variable
+
+Make sure `PATH` and `LD_LIBRARY_PATH` environment variables account for cuda. Please do a `echo $PATH` and `echo $LD_LIBRARY_PATH` to verify. If not, please follow [link](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#mandatory-actions) to do so. Example:
+
+```
+echo "export PATH=/usr/local/cuda-12.1/bin${PATH:+:${PATH}}" >> ~/.bashrc
+echo "export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> ~/.bashrc
+source ~/.bashrc
+```
+
 ### Wheel
 ```
-pip3 install torch==2.0
-pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/cuda/117/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
+pip3 install torch==2.1
+pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl
 ```

 ## Run a simple model
@@ -80,23 +90,25 @@ AMP is very useful on GPU training and PyTorch/XLA reuse Cuda's AMP rule. You ca
 1. Inside a GPU VM, create a docker container from a development docker image. For example:

 ```
-sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_11.8
+sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1
 sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
 distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
 curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
-sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_11.8
+sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1
 sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
 ```

 2. Build PyTorch and PyTorch/XLA from source.

+Make sure `PATH` and `LD_LIBRARY_PATH` environment variables account for cuda. See the [above](https://github.com/pytorch/xla/blob/master/docs/gpu.md#check-environment-variable) for more info.
+
 ```
 git clone https://github.com/pytorch/pytorch.git
 cd pytorch
-USE_CUDA=0 python setup.py install
+USE_CUDA=1 python setup.py install

 git clone https://github.com/pytorch/xla.git
 cd xla
docs/spmd.md

Lines changed: 7 additions & 7 deletions
@@ -33,7 +33,7 @@ Also, this version of the SPMD is currently only tested.optimized on Google Clou

 ### Simple Example & Sharding Aannotation API

-Users can annotate native PyTorch tensors using the `mark_sharding` API ([src](https://github.com/pytorch/xla/blob/9a5fdf3920c18275cf7dba785193636f1b39ced9/torch_xla/experimental/xla_sharding.py#L388)). This takes `torch.Tensor` as input and returns a `XLAShardedTensor` as output.
+Users can annotate native PyTorch tensors using the `mark_sharding` API ([src](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharding.py#L452)). This takes `torch.Tensor` as input and returns a `XLAShardedTensor` as output.

 ```python
 def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh, partition_spec: Tuple[Union[int, None]]) -> XLAShardedTensor
@@ -46,8 +46,8 @@ import numpy as np
 import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
-import torch_xla.experimental.xla_sharding as xs
-from torch_xla.experimental.xla_sharding import Mesh
+import torch_xla.distributed.spmd as xs
+from torch_xla.distributed.spmd import Mesh

 # Enable XLA SPMD execution mode.
 xr.use_spmd()
@@ -100,11 +100,11 @@ We derive a logical mesh based on this topology to create sub-groups of devices

 ![alt_text](assets/mesh_spmd2.png "image_tooltip")

-We abstract logical mesh with [Mesh API](https://github.com/pytorch/xla/blob/028df4da388468fa9a41b1f98ea08bfce13b4c63/torch_xla/experimental/xla_sharding.py#L16). The axes of the logical Mesh can be named. Here is an example:
+We abstract logical mesh with [Mesh API](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharding.py#L17). The axes of the logical Mesh can be named. Here is an example:

 ```python
 import torch_xla.runtime as xr
-from torch_xla.experimental.xla_sharding import Mesh
+from torch_xla.distributed.spmd import Mesh

 # Assuming you are running on a TPU host that has 8 devices attached
 num_devices = xr.global_runtime_device_count()
@@ -130,7 +130,7 @@ In general, SPMD programs should create a single mesh and reuse it for all shard
 Mesh nicely abstracts how the physical device mesh is constructed. Users can arrange devices in any shape and order using the logical mesh. However, one can define a more performant mesh based on the physical topology, especially when it involves Data Center Network (DCN) cross slice connections. HybridMesh creates a mesh which gives good performance out of the box for such multislice environments. It accepts ici\_mesh\_shape and dcn\_mesh\_shape which denote logical mesh shapes of inner and outer network.

 ```python
-from torch_xla.experimental.xla_sharding import HybridMesh
+from torch_xla.distributed.spmd import HybridMesh

 # This example is assuming 2 slices of v4-8.
 # - ici_mesh_shape: shape of the logical mesh for inner connected devices.
@@ -198,7 +198,7 @@ The main use case for `XLAShardedTensor` [[RFC](https://github.com/pytorch/xla/i
 * `XLAShardedTensor` is a `torch.Tensor` subclass and works directly with native torch ops and `module.layers`. We use `__torch_dispatch__` to send `XLAShardedTensor` to the XLA backend. PyTorch/XLA retrieves attached sharding annotations to trace the graph and invokes XLA SPMDPartitioner.
 * Internally, `XLAShardedTensor` (and its global\_tensor input) is backed by `XLATensor` with a special data structure holding references to the sharded device data.
 * The sharded tensor after lazy execution may be gathered and materialized back to the host as global\_tensor when requested on the host (e.g., printing the value of the global tensor.
-* The handles to the local shards are materialized strictly after the lazy execution. `XLAShardedTensor` exposes [local\_shards](https://github.com/pytorch/xla/blob/909f28fa4c1a44efcd21051557b3bcf2d399620d/torch_xla/experimental/xla_sharded_tensor.py#L111) to return the local shards on addressable devices as <code>List[[XLAShard](https://github.com/pytorch/xla/blob/909f28fa4c1a44efcd21051557b3bcf2d399620d/torch_xla/experimental/xla_sharded_tensor.py#L12)]</code>.
+* The handles to the local shards are materialized strictly after the lazy execution. `XLAShardedTensor` exposes [local\_shards](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharded_tensor.py#L117) to return the local shards on addressable devices as <code>List[[XLAShard](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharded_tensor.py#L12)]</code>.

 There is also an ongoing effort to integrate <code>XLAShardedTensor</code> into <code>DistributedTensor</code> API to support XLA backend [[RFC](https://github.com/pytorch/pytorch/issues/92909)].
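
Pulling the renamed imports together, a minimal end-to-end sharding sketch based on the doc's own examples (the tensor shape is arbitrary, and a TPU host with several attached devices is assumed):

```python
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd import Mesh

xr.use_spmd()  # enable XLA SPMD execution mode

# Build a 2D logical mesh over all attached devices, as in the doc.
num_devices = xr.global_runtime_device_count()
mesh = Mesh(np.array(range(num_devices)), (num_devices, 1), ('data', 'model'))

t = torch.randn(8, 4).to(xm.xla_device())
sharded = xs.mark_sharding(t, mesh, (0, 1))  # shard dim 0 across the 'data' axis
```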

infra/ansible/config/env.yaml

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,8 @@ release_env:
   cuda:
     TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
     XLA_CUDA: 1
+    PATH: /usr/local/cuda-{{ cuda_version }}/bin:/usr/local/nvidia/bin${PATH:+:${PATH}}
+    LD_LIBRARY_PATH: /usr/local/cuda-{{ cuda_version }}/lib64:/usr/local/cuda-{{ cuda_version }}/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}:/usr/local/lib

 # Variables that will be passed to shell environment only for building PyTorch and XLA libs.
 build_env:
@@ -44,6 +46,8 @@ build_env:
   cuda:
     TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
     XLA_CUDA: 1
+    PATH: /usr/local/cuda-{{ cuda_version }}/bin:/usr/local/nvidia/bin${PATH:+:${PATH}}
+    LD_LIBRARY_PATH: /usr/local/cuda-{{ cuda_version }}/lib64:/usr/local/cuda-{{ cuda_version }}/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}:/usr/local/lib

   tpu:
     ACCELERATOR: tpu
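
A note on the `${PATH:+:${PATH}}` suffix in these values: the shell's `:+` expansion appends `:` plus the existing value only when the variable is already set, so no dangling colon is produced. A small Python analogue of that behavior (illustrative only):

```python
# Mimic `new_entry${VAR:+:${VAR}}`: keep the old value only if it is set.
import os

def prepend_path(new_entry: str, var: str = "PATH") -> str:
    old = os.environ.get(var, "")
    return new_entry + (":" + old if old else "")

print(prepend_path("/usr/local/cuda-12.1/bin"))
```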

infra/tpu-pytorch-releases/artifacts.auto.tfvars

Lines changed: 16 additions & 0 deletions
@@ -3,10 +3,18 @@ nightly_package_version = "2.2.0"
 # Built once a day from master.
 nightly_builds = [
   { accelerator = "tpu" },
+  {
+    accelerator = "tpu"
+    python_version = "3.9"
+  },
   {
     accelerator = "tpu"
     python_version = "3.10"
   },
+  {
+    accelerator = "tpu"
+    python_version = "3.11"
+  },
   {
     accelerator = "cuda"
     cuda_version = "12.1"
@@ -73,6 +81,14 @@ versioned_builds = [
     python_version = "3.10"
     bundle_libtpu = "0"
   },
+  {
+    git_tag = "v2.1.0"
+    pytorch_git_rev = "v2.1.0"
+    package_version = "2.1.0"
+    accelerator = "tpu"
+    python_version = "3.11"
+    bundle_libtpu = "0"
+  },
   # Bundle libtpu for Kaggle
   {
     git_tag = "v2.1.0"

setup.py

Lines changed: 3 additions & 2 deletions
@@ -244,9 +244,10 @@ def bazel_build(self, ext):

     bazel_argv = [
         'bazel', 'build', ext.bazel_target,
-        f"--symlink_prefix={os.path.join(self.build_temp, 'bazel-')}",
-        '\n'.join(['--cxxopt=%s' % opt for opt in extra_compile_args])
+        f"--symlink_prefix={os.path.join(self.build_temp, 'bazel-')}"
     ]
+    for opt in extra_compile_args:
+      bazel_argv.append("--cxxopt={}".format(opt))

     # Debug build.
     if DEBUG:
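
This rewrite fixes a subtle argv bug: `'\n'.join(...)` produced a single list element containing every `--cxxopt` flag separated by newlines, which bazel received as one malformed argument. The loop appends each flag as its own element. A small demonstration with made-up flag values:

```python
# Before vs. after: same flags, very different argv shapes.
extra_compile_args = ["-std=c++17", "-Wno-sign-compare"]

old = ['\n'.join('--cxxopt=%s' % opt for opt in extra_compile_args)]
new = ["--cxxopt={}".format(opt) for opt in extra_compile_args]

print(old)  # ['--cxxopt=-std=c++17\n--cxxopt=-Wno-sign-compare']   <- 1 argument
print(new)  # ['--cxxopt=-std=c++17', '--cxxopt=-Wno-sign-compare'] <- 2 arguments
```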

test/cpp/BUILD

Lines changed: 12 additions & 11 deletions
@@ -78,9 +78,9 @@ ptxla_cc_test(
         ":torch_xla_test",
         "//torch_xla/csrc/runtime:runtime",
         "//torch_xla/csrc/runtime:debug_macros",
-        "//torch_xla/csrc/runtime:multi_wait",
-        "//torch_xla/csrc/runtime:thread_pool",
         "//torch_xla/csrc:tensor",
+        "//torch_xla/csrc:thread_pool",
+        "@com_google_absl//absl/synchronization",
         "@com_google_googletest//:gtest_main",
         "@xla//xla:shape_util",
         "@xla//xla/client:xla_builder",
@@ -101,15 +101,16 @@ ptxla_cc_test(
     ],
 )

-ptxla_cc_test(
-    name = "test_xla_backend_intf",
-    srcs = ["test_xla_backend_intf.cpp"],
-    deps = [
-        ":cpp_test_util",
-        "//torch_xla/csrc:tensor",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
+# Disable this test since it is flaky on upstream
+# ptxla_cc_test(
+#     name = "test_xla_backend_intf",
+#     srcs = ["test_xla_backend_intf.cpp"],
+#     deps = [
+#         ":cpp_test_util",
+#         "//torch_xla/csrc:tensor",
+#         "@com_google_googletest//:gtest_main",
+#     ],
+# )

 ptxla_cc_test(
     name = "test_xla_sharding",

test/cpp/cpp_test_util.cpp

Lines changed: 1 addition & 1 deletion
@@ -307,7 +307,7 @@ std::vector<at::Tensor> Fetch(
   std::vector<at::Tensor> tensors;
   for (auto& literal : literals) {
     tensors.push_back(MakeTensorFromXlaLiteral(
-        literal, TensorTypeFromXlaType(literal.shape().element_type())));
+        literal, MaybeUpcastToHostTorchType(literal.shape().element_type())));
   }
   return tensors;
 }
