
Commit 56e57f0

Merge remote-tracking branch 'origin' into benchmark
2 parents: a6bef63 + 57e6035

105 files changed: +7864 −2835 lines


README.md

Lines changed: 6 additions & 5 deletions
@@ -147,10 +147,11 @@ bucket.

 | Version | Cloud TPU VMs Wheel |
 | --- | ----------- |
-| 2.1 (CUDA 12.0 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
+| 2.1 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
 | 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` |
 | nightly (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
 | nightly (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl` |
+| nightly (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |

 <details>

@@ -230,11 +231,11 @@ This is only required on Cloud TPU VMs.

 <br/>

-| Version | GPU CUDA 12.0 Docker |
+| Version | GPU CUDA 12.1 Docker |
 | --- | ----------- |
-| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.0` |
-| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0` |
-| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0_YYYYMMDD` |
+| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.1` |
+| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1` |
+| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1_YYYYMMDD` |

 <br/>
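A quick note for readers (not part of the diff): one of the CUDA 12.1 wheels above can be sanity-checked with a short Python session. This is a minimal sketch; it assumes a PJRT runtime where `PJRT_DEVICE=CUDA` selects the GPU backend, which may differ on older releases.

```python
# Minimal smoke test after installing a CUDA wheel (illustrative; the
# PJRT_DEVICE value is an assumption and was "GPU" on some older builds).
import os
os.environ.setdefault("PJRT_DEVICE", "CUDA")

import torch
import torch_xla.core.xla_model as xm

t = torch.randn(2, 2, device=xm.xla_device())
print(t.device)  # expect something like "xla:0"
```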
codegen/lazy_tensor_generator.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def build_ir_node(self, func: NativeFunction, schema: LazyIrSchema) -> str:
         get_tensorlist="GetTensorList",
         get_tensor_or_wrap_number="bridge::GetXlaTensorOrCreateForWrappedNumber",
         try_get_tensor="bridge::TryGetXlaTensor",
-        metrics_counter='TORCH_LAZY_FN_COUNTER("xla::")',
+        metrics_counter='TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::")',
         create_tensor="XLATensor::Create",
         create_aten_from_ltc_tensor="torch_xla::bridge::AtenFromXlaTensor",
         tuple_aten_from_ltc_tensors="torch_xla::bridge::TupleAtenFromXlaTensors",
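
For context: this swaps the macro that the generated lowerings call for each traced op; the `_TIMED_TRACING` variant, as the name suggests, records timing/tracing alongside the per-op counter. The counters remain visible from Python. A hedged sketch, assuming the standard `torch_xla.debug.metrics` API:

```python
# Illustrative only: inspect the "xla::" counters that generated code
# increments via the metrics_counter macro configured above.
import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

y = torch.ones(2, 2, device=xm.xla_device()) * 3
xm.mark_step()  # force execution of the traced graph
print([c for c in met.counter_names() if c.startswith("xla::")])
```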

codegen/xla_native_functions.yaml

Lines changed: 1 addition & 0 deletions
@@ -192,6 +192,7 @@ supported:
 - floor_divide
 - fmod.Scalar
 - fmod.Tensor
+- full
 - gather
 - gelu
 - gelu_backward
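
Adding `full` to the `supported:` list tells the codegen to emit a native XLA lowering for `aten::full`. A hedged sketch of what this enables (illustrative, not from the commit; the fallback-counter check is an assumption about how unlowered ops surface in metrics):

```python
# With a native lowering for `full`, this tensor creation should stay on
# the XLA device rather than taking the aten::full fallback path.
import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

t = torch.full((2, 3), 1.5, device=xm.xla_device())
print(t.cpu())
print(met.counter_names())  # an "aten::full" fallback counter should be absent
```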

docs/gpu.md

Lines changed: 19 additions & 7 deletions
@@ -11,14 +11,14 @@ You can either use a local machine with GPU attached or a GPU VM on the cloud. F
 ### Docker
 Pytorch/XLA currently publish prebuilt docker images and wheels with cuda11.7/8 and python 3.8. We recommend users to create a docker container with corresponding config. For a full list of docker images and wheels, please refer to [this doc](https://github.com/pytorch/xla#available-docker-images-and-wheels).
 ```
-sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8
+sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1
 sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
 distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
 curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
-sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8 bin/bash
+sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1 bin/bash
 sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
 ```

@@ -49,10 +49,20 @@ Thu Dec 8 06:24:29 2022

 ```

+### Check environment variable
+
+Make sure `PATH` and `LD_LIBRARY_PATH` environment variables account for cuda. Please do a `echo $PATH` and `echo $LD_LIBRARY_PATH` to verify. If not, please follow [link](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#mandatory-actions) to do so. Example:
+
+```
+echo "export PATH=/usr/local/cuda-12.1/bin${PATH:+:${PATH}}" >> ~/.bashrc
+echo "export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> ~/.bashrc
+source ~/.bashrc
+```
+
 ### Wheel
 ```
-pip3 install torch==2.0
-pip3 install https://storage.googleapis.com/tpu-pytorch/wheels/cuda/117/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
+pip3 install torch==2.1
+pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl
 ```

 ## Run a simple model
@@ -80,23 +90,25 @@ AMP is very useful on GPU training and PyTorch/XLA reuse Cuda's AMP rule. You ca
 1. Inside a GPU VM, create a docker container from a development docker image. For example:

 ```
-sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_11.8
+sudo docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1
 sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
 distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
 curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
-sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_11.8
+sudo docker run --shm-size=16g --net=host --gpus all -it -d us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1
 sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
 ```

 2. Build PyTorch and PyTorch/XLA from source.

+Make sure `PATH` and `LD_LIBRARY_PATH` environment variables account for cuda. See the [above](https://github.com/pytorch/xla/blob/master/docs/gpu.md#check-environment-variable) for more info.
+
 ```
 git clone https://github.com/pytorch/pytorch.git
 cd pytorch
-USE_CUDA=0 python setup.py install
+USE_CUDA=1 python setup.py install

 git clone https://github.com/pytorch/xla.git
 cd xla
docs/spmd.md

Lines changed: 7 additions & 7 deletions
@@ -33,7 +33,7 @@ Also, this version of the SPMD is currently only tested.optimized on Google Clou

 ### Simple Example & Sharding Aannotation API

-Users can annotate native PyTorch tensors using the `mark_sharding` API ([src](https://github.com/pytorch/xla/blob/9a5fdf3920c18275cf7dba785193636f1b39ced9/torch_xla/experimental/xla_sharding.py#L388)). This takes `torch.Tensor` as input and returns a `XLAShardedTensor` as output.
+Users can annotate native PyTorch tensors using the `mark_sharding` API ([src](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharding.py#L452)). This takes `torch.Tensor` as input and returns a `XLAShardedTensor` as output.

 ```python
 def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh, partition_spec: Tuple[Union[int, None]]) -> XLAShardedTensor
@@ -46,8 +46,8 @@ import numpy as np
 import torch
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
-import torch_xla.experimental.xla_sharding as xs
-from torch_xla.experimental.xla_sharding import Mesh
+import torch_xla.distributed.spmd as xs
+from torch_xla.distributed.spmd import Mesh

 # Enable XLA SPMD execution mode.
 xr.use_spmd()
@@ -100,11 +100,11 @@ We derive a logical mesh based on this topology to create sub-groups of devices

 ![alt_text](assets/mesh_spmd2.png "image_tooltip")

-We abstract logical mesh with [Mesh API](https://github.com/pytorch/xla/blob/028df4da388468fa9a41b1f98ea08bfce13b4c63/torch_xla/experimental/xla_sharding.py#L16). The axes of the logical Mesh can be named. Here is an example:
+We abstract logical mesh with [Mesh API](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharding.py#L17). The axes of the logical Mesh can be named. Here is an example:

 ```python
 import torch_xla.runtime as xr
-from torch_xla.experimental.xla_sharding import Mesh
+from torch_xla.distributed.spmd import Mesh

 # Assuming you are running on a TPU host that has 8 devices attached
 num_devices = xr.global_runtime_device_count()
@@ -130,7 +130,7 @@ In general, SPMD programs should create a single mesh and reuse it for all shard
 Mesh nicely abstracts how the physical device mesh is constructed. Users can arrange devices in any shape and order using the logical mesh. However, one can define a more performant mesh based on the physical topology, especially when it involves Data Center Network (DCN) cross slice connections. HybridMesh creates a mesh which gives good performance out of the box for such multislice environments. It accepts ici\_mesh\_shape and dcn\_mesh\_shape which denote logical mesh shapes of inner and outer network.

 ```python
-from torch_xla.experimental.xla_sharding import HybridMesh
+from torch_xla.distributed.spmd import HybridMesh

 # This example is assuming 2 slices of v4-8.
 # - ici_mesh_shape: shape of the logical mesh for inner connected devices.
@@ -198,7 +198,7 @@ The main use case for `XLAShardedTensor` [[RFC](https://github.com/pytorch/xla/i
 * `XLAShardedTensor` is a `torch.Tensor` subclass and works directly with native torch ops and `module.layers`. We use `__torch_dispatch__` to send `XLAShardedTensor` to the XLA backend. PyTorch/XLA retrieves attached sharding annotations to trace the graph and invokes XLA SPMDPartitioner.
 * Internally, `XLAShardedTensor` (and its global\_tensor input) is backed by `XLATensor` with a special data structure holding references to the sharded device data.
 * The sharded tensor after lazy execution may be gathered and materialized back to the host as global\_tensor when requested on the host (e.g., printing the value of the global tensor.
-* The handles to the local shards are materialized strictly after the lazy execution. `XLAShardedTensor` exposes [local\_shards](https://github.com/pytorch/xla/blob/909f28fa4c1a44efcd21051557b3bcf2d399620d/torch_xla/experimental/xla_sharded_tensor.py#L111) to return the local shards on addressable devices as <code>List[[XLAShard](https://github.com/pytorch/xla/blob/909f28fa4c1a44efcd21051557b3bcf2d399620d/torch_xla/experimental/xla_sharded_tensor.py#L12)]</code>.
+* The handles to the local shards are materialized strictly after the lazy execution. `XLAShardedTensor` exposes [local\_shards](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharded_tensor.py#L117) to return the local shards on addressable devices as <code>List[[XLAShard](https://github.com/pytorch/xla/blob/4e8e5511555073ce8b6d1a436bf808c9333dcac6/torch_xla/distributed/spmd/xla_sharded_tensor.py#L12)]</code>.

 There is also an ongoing effort to integrate <code>XLAShardedTensor</code> into <code>DistributedTensor</code> API to support XLA backend [[RFC](https://github.com/pytorch/pytorch/issues/92909)].
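
Pulling the renamed imports together, a minimal end-to-end sharding sketch based on the doc's own examples (the tensor shape is arbitrary, and a TPU host with several attached devices is assumed):

```python
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd import Mesh

xr.use_spmd()  # enable XLA SPMD execution mode

# Build a 2D logical mesh over all attached devices, as in the doc.
num_devices = xr.global_runtime_device_count()
mesh = Mesh(np.array(range(num_devices)), (num_devices, 1), ('data', 'model'))

t = torch.randn(8, 4).to(xm.xla_device())
sharded = xs.mark_sharding(t, mesh, (0, 1))  # shard dim 0 across the 'data' axis
```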

infra/ansible/config/env.yaml

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,8 @@ release_env:
   cuda:
     TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
     XLA_CUDA: 1
+    PATH: /usr/local/cuda-{{ cuda_version }}/bin:/usr/local/nvidia/bin${PATH:+:${PATH}}
+    LD_LIBRARY_PATH: /usr/local/cuda-{{ cuda_version }}/lib64:/usr/local/cuda-{{ cuda_version }}/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}:/usr/local/lib

 # Variables that will be passed to shell environment only for building PyTorch and XLA libs.
 build_env:
@@ -44,6 +46,8 @@ build_env:
   cuda:
     TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
     XLA_CUDA: 1
+    PATH: /usr/local/cuda-{{ cuda_version }}/bin:/usr/local/nvidia/bin${PATH:+:${PATH}}
+    LD_LIBRARY_PATH: /usr/local/cuda-{{ cuda_version }}/lib64:/usr/local/cuda-{{ cuda_version }}/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}:/usr/local/lib

   tpu:
     ACCELERATOR: tpu
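
A note on the `${PATH:+:${PATH}}` suffix in these values: the shell's `:+` expansion appends `:` plus the existing value only when the variable is already set, so no dangling colon is produced. A small Python analogue of that behavior (illustrative only):

```python
# Mimic `new_entry${VAR:+:${VAR}}`: keep the old value only if it is set.
import os

def prepend_path(new_entry: str, var: str = "PATH") -> str:
    old = os.environ.get(var, "")
    return new_entry + (":" + old if old else "")

print(prepend_path("/usr/local/cuda-12.1/bin"))
```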

infra/tpu-pytorch-releases/artifacts.auto.tfvars

Lines changed: 16 additions & 0 deletions
@@ -3,10 +3,18 @@ nightly_package_version = "2.2.0"
 # Built once a day from master.
 nightly_builds = [
   { accelerator = "tpu" },
+  {
+    accelerator = "tpu"
+    python_version = "3.9"
+  },
   {
     accelerator = "tpu"
     python_version = "3.10"
   },
+  {
+    accelerator = "tpu"
+    python_version = "3.11"
+  },
   {
     accelerator = "cuda"
     cuda_version = "12.1"
@@ -73,6 +81,14 @@ versioned_builds = [
     python_version = "3.10"
     bundle_libtpu = "0"
   },
+  {
+    git_tag = "v2.1.0"
+    pytorch_git_rev = "v2.1.0"
+    package_version = "2.1.0"
+    accelerator = "tpu"
+    python_version = "3.11"
+    bundle_libtpu = "0"
+  },
   # Bundle libtpu for Kaggle
   {
     git_tag = "v2.1.0"

setup.py

Lines changed: 3 additions & 2 deletions
@@ -244,9 +244,10 @@ def bazel_build(self, ext):

     bazel_argv = [
         'bazel', 'build', ext.bazel_target,
-        f"--symlink_prefix={os.path.join(self.build_temp, 'bazel-')}",
-        '\n'.join(['--cxxopt=%s' % opt for opt in extra_compile_args])
+        f"--symlink_prefix={os.path.join(self.build_temp, 'bazel-')}"
     ]
+    for opt in extra_compile_args:
+      bazel_argv.append("--cxxopt={}".format(opt))

     # Debug build.
     if DEBUG:
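
This rewrite fixes a subtle argv bug: `'\n'.join(...)` produced a single list element containing every `--cxxopt` flag separated by newlines, which bazel received as one malformed argument. The loop appends each flag as its own element. A small demonstration with made-up flag values:

```python
# Before vs. after: same flags, very different argv shapes.
extra_compile_args = ["-std=c++17", "-Wno-sign-compare"]

old = ['\n'.join('--cxxopt=%s' % opt for opt in extra_compile_args)]
new = ["--cxxopt={}".format(opt) for opt in extra_compile_args]

print(old)  # ['--cxxopt=-std=c++17\n--cxxopt=-Wno-sign-compare']   <- 1 argument
print(new)  # ['--cxxopt=-std=c++17', '--cxxopt=-Wno-sign-compare'] <- 2 arguments
```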

test/cpp/BUILD

Lines changed: 12 additions & 11 deletions
@@ -78,9 +78,9 @@ ptxla_cc_test(
         ":torch_xla_test",
         "//torch_xla/csrc/runtime:runtime",
         "//torch_xla/csrc/runtime:debug_macros",
-        "//torch_xla/csrc/runtime:multi_wait",
-        "//torch_xla/csrc/runtime:thread_pool",
         "//torch_xla/csrc:tensor",
+        "//torch_xla/csrc:thread_pool",
+        "@com_google_absl//absl/synchronization",
         "@com_google_googletest//:gtest_main",
         "@xla//xla:shape_util",
         "@xla//xla/client:xla_builder",
@@ -101,15 +101,16 @@ ptxla_cc_test(
     ],
 )

-ptxla_cc_test(
-    name = "test_xla_backend_intf",
-    srcs = ["test_xla_backend_intf.cpp"],
-    deps = [
-        ":cpp_test_util",
-        "//torch_xla/csrc:tensor",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
+# Disable this test since it is flaky on upstream
+# ptxla_cc_test(
+#     name = "test_xla_backend_intf",
+#     srcs = ["test_xla_backend_intf.cpp"],
+#     deps = [
+#         ":cpp_test_util",
+#         "//torch_xla/csrc:tensor",
+#         "@com_google_googletest//:gtest_main",
+#     ],
+# )

 ptxla_cc_test(
     name = "test_xla_sharding",

test/cpp/cpp_test_util.cpp

Lines changed: 1 addition & 1 deletion
@@ -307,7 +307,7 @@ std::vector<at::Tensor> Fetch(
   std::vector<at::Tensor> tensors;
   for (auto& literal : literals) {
     tensors.push_back(MakeTensorFromXlaLiteral(
-        literal, TensorTypeFromXlaType(literal.shape().element_type())));
+        literal, MaybeUpcastToHostTorchType(literal.shape().element_type())));
   }
   return tensors;
 }
