diff --git a/docker/experimental/ansible/.ansible-lint b/docker/experimental/ansible/.ansible-lint new file mode 100644 index 000000000000..a8661e612016 --- /dev/null +++ b/docker/experimental/ansible/.ansible-lint @@ -0,0 +1,6 @@ +--- +# .ansible-lint + +profile: moderate +skip_list: + - schema[tasks] \ No newline at end of file diff --git a/docker/experimental/ansible/README.md b/docker/experimental/ansible/README.md new file mode 100644 index 000000000000..9094f645de30 --- /dev/null +++ b/docker/experimental/ansible/README.md @@ -0,0 +1,58 @@ +# Ansible playbook + +This ansible playbook will perform the following actions on the localhost: + * install required pip and apt packages, depending on the specified stage, + architecture and accelerator (see [apt.yaml](config/apt.yaml) and + [pip.yaml](config/pip.yaml)). + * fetch bazel (bazelisk version configured in [roles/bazel/defaults/main.yaml](roles/bazel/defaults/main.yaml)), + * fetch PyTorch and XLA sources at master (or specific revisions, + see role `fetch_srcs` in [playbook.yaml](playbook.yaml)). + * set required environment variables (see [env.yaml](config/env.yaml)), + * build and install PyTorch and XLA wheels, + * apply infrastructure tests (see `*/tests.yaml` files in [roles](roles)). + +## Prerequisites + +* Python 3.8+ +* Ansible. Install with `pip install ansible`. + +## Running + +The playbook requires explicitly passing 3 variables that configure playbook +behavior (installed pip/apt packages and set environment variables): +* `stage`: build or release. Different packages are installed depending on + the chosen stage. +* `arch`: aarch64 or amd64. Architecture of the built image and wheels. +* `accelerator`: tpu or cuda. Available accelerator. + +The variables can be passed through `-e` flag: `-e "NAME=VALUE"`. + +Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu"` + +## Config structure + +The playbook configuration is split into 4 files, one per logical system. 
+The configuration is simply loaded as playbook variables which are then passed +to specific roles and tasks. +Only variables in [config/env.yaml](config/env.yaml) are passed as env variables. + +* [apt.yaml](config/apt.yaml) - specifies apt packages for each stage and + architecture or accelerator. + Packages shared between all architectures and accelerators in a given stage + are specified in `*_common`. They are concatenated with the architecture- and accelerator-specific lists. + + This config also contains a list of required apt repos and signing keys. + These variables are mainly consumed by the [install_deps](roles/install_deps/tasks/main.yaml) role. + +* [pip.yaml](config/pip.yaml) - similarly to apt.yaml, lists pip packages per stage and arch / accelerator. + In both pip and apt config files stage and arch / accelerator are + concatenated together and specified under one key (e.g. build_amd64, release_tpu). + +* [env.yaml](config/env.yaml) - contains Ansible variables that are passed as env variables when + building PyTorch and XLA (`build_env`). Variables in `release_env` are appended as `export` lines to `~/.bashrc` and `~/.zshrc` (executed for the `release` stage). + +* [vars.yaml](config/vars.yaml) - Ansible variables used in other config files and throughout the playbook. + Not associated with any particular system. + +Variables from these config files are dynamically loaded (during playbook execution), +see [playbook.yaml](playbook.yaml). diff --git a/docker/experimental/ansible/ansible.cfg b/docker/experimental/ansible/ansible.cfg new file mode 100644 index 000000000000..cb7519265802 --- /dev/null +++ b/docker/experimental/ansible/ansible.cfg @@ -0,0 +1,14 @@ +# See https://docs.ansible.com/ansible/latest/reference_appendices/config.html +# for various configuration options. + +[defaults] +# Displays task execution duration. +callbacks_enabled = profile_tasks +# The playbook is only run on the implicit localhost. +# Silence warning about empty hosts inventory. 
+localhost_warning = False
+
+[inventory]
+# Silence warning about no inventory.
+# This option is available since Ansible 2.14 (available only with Python 3.9+).
+inventory_unparsed_warning = False
\ No newline at end of file
diff --git a/docker/experimental/ansible/config/apt.yaml b/docker/experimental/ansible/config/apt.yaml
new file mode 100644
index 000000000000..97ce1755f234
--- /dev/null
+++ b/docker/experimental/ansible/config/apt.yaml
@@ -0,0 +1,61 @@
+# Lists of apt packages for each stage (build|release) and arch or accelerator. Keys are `{stage}_common`, `{stage}_{arch}` and `{stage}_{accelerator}`.
+apt:
+  pkgs:
+    build_common:  # installed for every build, whatever the arch or accelerator
+      - ccache
+      - curl
+      - git
+      - gnupg
+      - libopenblas-dev
+      - ninja-build
+      - procps
+      - python3-pip
+      - rename
+      - vim
+      - wget
+
+    build_cuda:  # CUDA 11.8 packages; cuDNN is pinned to an exact version
+      - cuda-libraries-11-8
+      - cuda-toolkit-11-8
+      - cuda-minimal-build-11-8
+      - libcudnn8=8.8.0.121-1+cuda11.8
+      - libcudnn8-dev=8.8.0.121-1+cuda11.8
+
+    build_amd64:  # `clang_version` is defined in config/vars.yaml
+      - "clang-{{ clang_version }}"
+
+    build_aarch64:
+      - scons
+      - gcc-10
+      - g++-10
+
+    release_common:  # installed for every release image
+      - curl
+      - git
+      - gnupg
+      - google-cloud-cli
+      - libgomp1
+      - libopenblas-base
+      - patch
+
+    release_cuda:
+      - cuda-libraries-11-8
+      - cuda-minimal-build-11-8
+      - libcudnn8=8.8.0.121-1+cuda11.8
+
+  # Each entry is an object with string fields `url` (where the key is downloaded from)
+  # and `keyring` (destination path; should start with /usr/share/keyrings/ on debian and ubuntu).
+  signing_keys:
+    - url: https://apt.llvm.org/llvm-snapshot.gpg.key
+      keyring: /usr/share/keyrings/llvm.pgp
+    - url: https://packages.cloud.google.com/apt/doc/apt-key.gpg
+      keyring: /usr/share/keyrings/cloud.google.gpg
+    - url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/3bf863cc.pub"
+      keyring: /usr/share/keyrings/cuda.pgp
+
+  repos:
+    # Each `signed-by` path must match the `keyring` path of the corresponding signing key above.
+    - "deb [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
+    - "deb-src [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
+    - "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main"
+    - "deb [signed-by=/usr/share/keyrings/cuda.pgp] https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/ /"
\ No newline at end of file
diff --git a/docker/experimental/ansible/config/env.yaml b/docker/experimental/ansible/config/env.yaml
new file mode 100644
index 000000000000..ce1e53d004cd
--- /dev/null
+++ b/docker/experimental/ansible/config/env.yaml
@@ -0,0 +1,42 @@
+# Runtime variables for the release stage: the configure_env role appends each one
+# as an `export NAME=value` line to ~/.bashrc and ~/.zshrc.
+release_env:
+  common:
+    CC: "clang-{{ clang_version }}"
+    CXX: "clang++-{{ clang_version }}"
+    LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"  # written verbatim into the rc files
+
+  tpu:
+    ACCELERATOR: tpu
+    TPUVM_MODE: 1
+
+  cuda:
+    TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
+    XLA_CUDA: 1
+
+# Variables passed through the task `environment` keyword by the build_srcs role (builds and import tests only).
+build_env:
+  common:
+    LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"  # NOTE(review): confirm `$LD_LIBRARY_PATH` gets expanded when set via `environment`
+    # Set explicitly to 0 as setup.py defaults this flag to true if unset.
+    BUILD_CPP_TESTS: 0
+    CC: "clang-{{ clang_version }}"
+    CXX: "clang++-{{ clang_version }}"
+    PYTORCH_BUILD_NUMBER: 1
+    TORCH_XLA_VERSION: "{{ package_version }}"
+    PYTORCH_BUILD_VERSION: "{{ package_version }}"
+    XLA_SANDBOX_BUILD: 1
+
+  amd64:
+    ARCH: amd64
+
+  aarch64: {}  # no aarch64-specific variables; explicit empty mapping instead of a bare (null) key
+
+  cuda:
+    TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
+    XLA_CUDA: 1
+
+  tpu:
+    ACCELERATOR: tpu
+    TPUVM_MODE: 1
+
diff --git a/docker/experimental/ansible/config/pip.yaml b/docker/experimental/ansible/config/pip.yaml
new file mode 100644
index 000000000000..add0fb0e221d
--- /dev/null
+++ b/docker/experimental/ansible/config/pip.yaml
@@ -0,0 +1,53 @@
+# Lists of pip packages for each stage (build|release) and arch or accelerator. Keys are `{stage}_common`, `{stage}_{arch}` and `{stage}_{accelerator}`.
+pip:
+  pkgs:
+    # Shared between all architectures and accelerators for the build stage.
+    build_common:
+      - astunparse
+      - cffi
+      - cloud-tpu-client
+      - cmake
+      - coverage
+      - dataclasses
+      - expecttest==0.1.3
+      - future
+      - git-archive-all
+      - google-api-python-client
+      - google-cloud-storage
+      - hypothesis
+      - lark-parser
+      - ninja
+      - numpy
+      - oauth2client
+      - pyyaml
+      - requests
+      - setuptools
+      - six
+      - tensorboard
+      - tensorboardX
+      - tqdm
+      - typing
+      - typing_extensions
+      - sympy
+
+    build_amd64:
+      - mkl
+      - mkl-include
+
+    build_aarch64: []  # nothing aarch64-specific; explicit empty list instead of a bare (null) key
+
+    # Shared between all architectures and accelerators for the release stage.
+    release_common:
+      - numpy
+      - pyyaml
+      - mkl
+      - mkl-include
+
+    release_tpu:
+      - torch_xla[tpuvm]
+
+  # Packages that will be installed with the `--no-deps` flag (see the install_deps role).
+  pkgs_nodeps:
+    release_common:
+      - torchvision
+      - pillow
diff --git a/docker/experimental/ansible/config/vars.yaml b/docker/experimental/ansible/config/vars.yaml
new file mode 100644
index 000000000000..4afb567aacb2
--- /dev/null
+++ b/docker/experimental/ansible/config/vars.yaml
@@ -0,0 +1,7 @@
+# Used for fetching cuda from the right repo, see apt.yaml.
+cuda_repo: ubuntu1804
+# Used for fetching clang from the right repo, see apt.yaml.
+llvm_debian_repo: buster
+clang_version: 10
+# PyTorch and PyTorch/XLA wheel versions. Quoted so YAML keeps it a string (an unquoted 2.10 would load as the float 2.1).
+package_version: "2.0"
\ No newline at end of file
diff --git a/docker/experimental/ansible/playbook.yaml b/docker/experimental/ansible/playbook.yaml
new file mode 100644
index 000000000000..7d89fc335383
--- /dev/null
+++ b/docker/experimental/ansible/playbook.yaml
@@ -0,0 +1,88 @@
+- name: "Install build dependencies"
+  hosts: localhost
+  connection: local
+
+  # The playbook requires passing 3 variables explicitly (validated below; an
+  # unset variable is looked up as '' so it fails the pattern check with fail_msg):
+  # - stage: build or release. Different packages are installed depending on the stage.
+  # - arch: aarch64 or amd64. Architecture of the built image and wheels.
+  # - accelerator: tpu or cuda. Available accelerator.
+  pre_tasks:
+    - name: "Validate required variables"
+      ansible.builtin.assert:
+        that: "lookup('ansible.builtin.vars', item.name, default='') is regex(item.pattern)"
+        fail_msg: >-
+          Variable '{{ item.name }}' doesn't match pattern '{{ item.pattern }}'.
+          Pass the required variable with: -e "{{ item.name }}=VALUE"
+      loop:
+        - name: stage
+          pattern: '^(build|release)$'
+        - name: arch
+          pattern: '^(aarch64|amd64)$'
+        - name: accelerator
+          pattern: '^(tpu|cuda)$'
+
+    - name: "Include vars from config files"
+      ansible.builtin.include_vars:
+        file: "config/{{ item }}"
+      loop:
+        # vars.yaml should be the first as other config files depend on it.
+        - vars.yaml
+        - apt.yaml
+        - pip.yaml
+        - env.yaml
+
+  roles:
+    - bazel
+
+    - role: install_deps
+      vars:
+        apt_keys: "{{ apt.signing_keys }}"
+
+        # If a variable (like `apt.pkgs.common`) is defined, but not set to
+        # anything it cannot be concatenated with a list.
+        # Use `v | default([], true)` to set `v` to an empty array if it evaluates to false.
+        # See https://jinja.palletsprojects.com/en/3.0.x/templates/#jinja-filters.default.
+        apt_pkgs: "{{
+          apt.pkgs[stage + '_common'] | default([], true) +
+          apt.pkgs[stage + '_' + arch] | default([], true) +
+          apt.pkgs[stage + '_' + accelerator] | default([], true)
+          }}"
+
+        apt_repos: "{{ apt.repos }}"
+
+        pip_pkgs: "{{
+          pip.pkgs[stage + '_common'] | default([], true) +
+          pip.pkgs[stage + '_' + arch] | default([], true) +
+          pip.pkgs[stage + '_' + accelerator] | default([], true)
+          }}"
+
+        pip_pkgs_nodeps: "{{
+          pip.pkgs_nodeps[stage + '_common'] | default([], true) +
+          pip.pkgs_nodeps[stage + '_' + arch] | default([], true) +
+          pip.pkgs_nodeps[stage + '_' + accelerator] | default([], true)
+          }}"
+
+    - role: fetch_srcs
+      vars:
+        src_root: "/src"
+        pytorch_git_rev: HEAD
+        xla_git_rev: HEAD
+
+    - role: build_srcs
+      vars:
+        src_root: "/src"
+        env_vars: "{{
+          build_env.common | default({}, true) |
+          combine(build_env[arch] | default({}, true)) |
+          combine(build_env[accelerator] | default({}, true))
+          }}"
+
+    - role: configure_env
+      vars:
+        env_vars: "{{
+          release_env.common | default({}, true) |
+          combine(release_env[arch] | default({}, true)) |
+          combine(release_env[accelerator] | default({}, true))
+          }}"
+      when: stage == "release"
diff --git a/docker/experimental/ansible/roles/bazel/defaults/main.yaml b/docker/experimental/ansible/roles/bazel/defaults/main.yaml
new file mode 100644
index 000000000000..6ddadb8b6863
--- /dev/null
+++ b/docker/experimental/ansible/roles/bazel/defaults/main.yaml
@@ -0,0 +1 @@
+bazelisk_version: 1.15.0
diff --git a/docker/experimental/ansible/roles/bazel/tasks/main.yaml b/docker/experimental/ansible/roles/bazel/tasks/main.yaml
new file mode 100644
index 000000000000..038a5a1cefa5
--- /dev/null
+++ b/docker/experimental/ansible/roles/bazel/tasks/main.yaml
@@ -0,0 +1,12 @@
+- name: "Download bazelisk v{{ bazelisk_version }}"
+  ansible.builtin.get_url:
+    # Pick the release asset matching the playbook's `arch` (aarch64 -> arm64); amd64 when `arch` is unset.
+    url: "https://github.com/bazelbuild/bazelisk/releases/download/v{{ bazelisk_version }}/bazelisk-linux-{{ 'arm64' if (arch | default('amd64')) == 'aarch64' else 'amd64' }}"
+    dest: /usr/local/bin/bazel
+    # Executable by every user, writable only by the owner.
+    mode: 'u=rwx,g=rx,o=rx'
+
+- name: "Tests"
+  ansible.builtin.include_tasks: tests.yaml
+  tags:
+    - tests
diff --git a/docker/experimental/ansible/roles/bazel/tasks/tests.yaml b/docker/experimental/ansible/roles/bazel/tasks/tests.yaml
new file mode 100644
index 000000000000..4cb19f772109
--- /dev/null
+++ b/docker/experimental/ansible/roles/bazel/tasks/tests.yaml
@@ -0,0 +1,5 @@
+- name: "Bazel --version runs successfully"
+  ansible.builtin.command:
+    cmd: bazel --version
+  # Read-only smoke test: never report the host as changed.
+  changed_when: false
diff --git a/docker/experimental/ansible/roles/build_srcs/tasks/main.yaml b/docker/experimental/ansible/roles/build_srcs/tasks/main.yaml
new file mode 100644
index 000000000000..667ee43f9219
--- /dev/null
+++ b/docker/experimental/ansible/roles/build_srcs/tasks/main.yaml
@@ -0,0 +1,46 @@
+- name: Build PyTorch
+  ansible.builtin.command:
+    cmd: python setup.py bdist_wheel
+    chdir: "{{ (src_root, 'pytorch') | path_join }}"
+    creates: "{{ (src_root, 'pytorch/dist/*.whl') | path_join }}"
+  # Set `USE_CUDA=0` as PyTorch cannot be used with GPU in eager and XLA mode.
+  environment: "{{ env_vars | combine({'USE_CUDA': 0}) }}"
+
+- name: Find PyTorch *.whl files in pytorch/dist
+  ansible.builtin.find:
+    path: "{{ (src_root, 'pytorch/dist') | path_join }}"
+    pattern: "*.whl"
+  register: pytorch_wheels
+
+- name: Install PyTorch wheels
+  ansible.builtin.pip:
+    name: "{{ pytorch_wheels.files | map(attribute='path') }}"
+    state: "forcereinstall"
+
+- name: Build XLA computation client library
+  ansible.builtin.command:
+    cmd: bash build_torch_xla_libs.sh -O -D_GLIBCXX_USE_CXX11_ABI=1
+    chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
+  environment: "{{ env_vars }}"
+
+- name: Build PyTorch/XLA
+  ansible.builtin.command:
+    cmd: python setup.py bdist_wheel
+    chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
+  environment: "{{ env_vars }}"
+
+- name: Find XLA *.whl files in pytorch/xla/dist
+  ansible.builtin.find:
+    path: "{{ (src_root, 'pytorch/xla/dist') | path_join }}"
+    pattern: "*.whl"
+  register: xla_wheels
+
+- name: Install XLA wheels
+
ansible.builtin.pip:
+    name: "{{ xla_wheels.files | map(attribute='path') }}"
+    state: "forcereinstall"
+
+- name: "Tests"
+  ansible.builtin.include_tasks: tests.yaml
+  tags:
+    - tests
diff --git a/docker/experimental/ansible/roles/build_srcs/tasks/tests.yaml b/docker/experimental/ansible/roles/build_srcs/tasks/tests.yaml
new file mode 100644
index 000000000000..9e925700ddeb
--- /dev/null
+++ b/docker/experimental/ansible/roles/build_srcs/tasks/tests.yaml
@@ -0,0 +1,11 @@
+- name: "Check that various import statements work"
+  ansible.builtin.command:
+    cmd: "{{ item }}"
+  environment: "{{ env_vars | combine({'USE_CUDA': 0}) }}"
+  # Import checks are smoke tests; never report the host as changed.
+  changed_when: false
+  loop:
+    - python -c "import torchgen"
+    - python -c "import torch"
+    - python -c "import torch_xla"
+    - python -c "import torch_xla.core.xla_model"
diff --git a/docker/experimental/ansible/roles/configure_env/tasks/main.yaml b/docker/experimental/ansible/roles/configure_env/tasks/main.yaml
new file mode 100644
index 000000000000..807912019d56
--- /dev/null
+++ b/docker/experimental/ansible/roles/configure_env/tasks/main.yaml
@@ -0,0 +1,14 @@
+# Every key of `env_vars` becomes one `export KEY=value` line; the rc files are created when missing.
+- name: Append environment variables required during runtime to ~/.bashrc
+  ansible.builtin.lineinfile:
+    path: ~/.bashrc
+    line: "export {{ item }}={{ env_vars[item] }}"
+    create: true
+  loop: "{{ env_vars.keys() | list }}"
+
+- name: Append environment variables required during runtime to ~/.zshrc
+  ansible.builtin.lineinfile:
+    path: ~/.zshrc
+    line: "export {{ item }}={{ env_vars[item] }}"
+    create: true
+  loop: "{{ env_vars.keys() | list }}"
diff --git a/docker/experimental/ansible/roles/fetch_srcs/defaults/main.yaml b/docker/experimental/ansible/roles/fetch_srcs/defaults/main.yaml
new file mode 100644
index 000000000000..54b409da6a22
--- /dev/null
+++ b/docker/experimental/ansible/roles/fetch_srcs/defaults/main.yaml
@@ -0,0 +1,3 @@
+# See https://docs.ansible.com/ansible/latest/collections/ansible/builtin/git_module.html#parameter-version
+pytorch_git_rev: HEAD
+xla_git_rev: HEAD
diff --git a/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml b/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml
new file mode 100644
index 000000000000..929a2404ac65
--- /dev/null
+++ b/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml
@@ -0,0 +1,45 @@
+- name: "Create source root directory at {{ src_root }}"
+  ansible.builtin.file:
+    path: "{{ src_root }}"
+    state: directory
+    mode: '0755'
+
+- name: "Clone PyTorch and XLA git repos"
+  ansible.builtin.git:
+    repo: "{{ item.repo }}"
+    dest: "{{ item.dest }}"
+    version: "{{ item.version }}"
+    # Shallow clone; `force` discards local modifications in an existing checkout.
+    depth: 1
+    force: true
+  loop:
+    - repo: https://github.com/pytorch/pytorch
+      dest: "{{ (src_root, 'pytorch') | path_join }}"
+      version: "{{ pytorch_git_rev }}"
+
+    - repo: https://github.com/pytorch/xla
+      dest: "{{ (src_root, 'pytorch/xla') | path_join }}"
+      version: "{{ xla_git_rev }}"
+
+- name: Find *.diff files in pytorch/xla/tf_patches
+  ansible.builtin.find:
+    path: "{{ (src_root, 'pytorch/xla/tf_patches') | path_join }}"
+    pattern: "*.diff"
+  register: tf_patches
+
+- name: Apply patches to Tensorflow
+  ansible.posix.patch:
+    src: "{{ item }}"
+    # Use source file on the target machine instead of the one where
+    # the playbook is located. Has no effect when the target machine is
+    # localhost.
+    remote_src: true
+    strip: 1
+    basedir: "{{ (src_root, 'pytorch/xla/third_party/tensorflow') | path_join }}"
+  loop: "{{ tf_patches.files | map(attribute='path') }}"
+  ignore_errors: true  # NOTE(review): patch failures are ignored; presumably best-effort, confirm
+
+- name: "Tests"
+  ansible.builtin.include_tasks: tests.yaml
+  tags:
+    - tests
diff --git a/docker/experimental/ansible/roles/fetch_srcs/tasks/tests.yaml b/docker/experimental/ansible/roles/fetch_srcs/tasks/tests.yaml
new file mode 100644
index 000000000000..f3c341197dc3
--- /dev/null
+++ b/docker/experimental/ansible/roles/fetch_srcs/tasks/tests.yaml
@@ -0,0 +1,14 @@
+- name: Retrieve status of setup.py files in XLA and PyTorch repos
+  ansible.builtin.stat:
+    path: "{{ item }}"
+  register: _res
+  loop:
+    - "{{ (src_root, 'pytorch/setup.py') | path_join }}"
+    - "{{ (src_root, 'pytorch/xla/setup.py') | path_join }}"
+
+- name: Assert that setup.py files exist
+  ansible.builtin.assert:
+    # `that` is evaluated as a conditional expression, so no Jinja delimiters are needed.
+    that: item.stat.exists
+    fail_msg: "{{ item.item }} doesn't exist"
+  loop: "{{ _res.results }}"
diff --git a/docker/experimental/ansible/roles/install_deps/tasks/main.yaml b/docker/experimental/ansible/roles/install_deps/tasks/main.yaml
new file mode 100644
index 000000000000..5782b3544c20
--- /dev/null
+++ b/docker/experimental/ansible/roles/install_deps/tasks/main.yaml
@@ -0,0 +1,35 @@
+- name: Add apt keys
+  # Don't use apt-key for adding repo keys since it's deprecated.
+  # Instead place gpg and pgp files in /usr/share/keyrings/ (debian, ubuntu).
+  ansible.builtin.get_url:
+    url: "{{ item.url }}"
+    dest: "{{ item.keyring }}"
+    mode: 'u=rw,g=r,o=r'
+  loop: "{{ apt_keys }}"
+  register: add_apt_key
+
+- name: Add apt repositories into sources list
+  ansible.builtin.apt_repository:
+    repo: "{{ item }}"
+    state: present
+  loop: "{{ apt_repos }}"
+  register: add_apt_repo
+
+- name: Update apt cache
+  ansible.builtin.apt:
+    update_cache: true
+  when: add_apt_key.changed or add_apt_repo.changed  # only when a key or repo was just added
+
+- name: Install apt packages
+  ansible.builtin.apt:
+    name: "{{ apt_pkgs }}"
+    update_cache: true  # refreshed here as well, so the install does not depend on the conditional update above
+
+- name: Install pip packages
+  ansible.builtin.pip:
+    name: "{{ pip_pkgs }}"
+
+- name: Install pip packages without deps (--no-deps)
+  ansible.builtin.pip:
+    name: "{{ pip_pkgs_nodeps }}"
+    extra_args: "--no-deps"