From 2a0af54b2923255ccb623d7f4513807e985192f5 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Wed, 14 Dec 2022 17:25:19 +0000 Subject: [PATCH 1/9] Add more options to support release builds. --- docker/experimental/cloudbuild.yaml | 5 +- docker/experimental/terraform/main.tf | 40 +++++++++++++++ .../terraform/modules/trigger/main.tf | 51 ++++++++++++++++--- 3 files changed, 87 insertions(+), 9 deletions(-) diff --git a/docker/experimental/cloudbuild.yaml b/docker/experimental/cloudbuild.yaml index cfe8f4deba8e..e7c8b8330b14 100644 --- a/docker/experimental/cloudbuild.yaml +++ b/docker/experimental/cloudbuild.yaml @@ -127,7 +127,10 @@ substitutions: _CACHE_TTL: '18h' _BUILD_ARGS: tpuvm=1,cuda=0 options: - machineType: E2_HIGHCPU_32 + # To run in a test project, either point to your pool or replace this with + # `machineType`. You may have to reduce _BAZEL_JOBS. + workerPool: projects/tpu-pytorch/locations/us-central1/wheel_build + # machineType: E2_HIGHCPU_32 dynamic_substitutions: true substitution_option: 'ALLOW_LOOSE' timeout: 24000s diff --git a/docker/experimental/terraform/main.tf b/docker/experimental/terraform/main.tf index 148c68868018..9a75acb7b5c3 100644 --- a/docker/experimental/terraform/main.tf +++ b/docker/experimental/terraform/main.tf @@ -37,6 +37,29 @@ resource "google_artifact_registry_repository" "torch-xla-docker-repo" { format = "DOCKER" } +resource "google_cloudbuild_worker_pool" "gcb-pool" { + name = "wheel_build" + location = "us-central1" + + worker_config { + disk_size_gb = 500 + machine_type = "e2-standard-32" + no_external_ip = false + } +} + +resource "google_service_account" "cloud-build-trigger-scheduler" { + account_id = "cloud-build-trigger-scheduler" + display_name = "Cloud Build Trigger Scheduler" + description = "Service account for running Cloud Build triggers in a Cloud Scheduler job" +} + +resource "google_project_iam_member" "cloud-build-scheduler-permission" { + project = 
google_service_account.cloud-build-trigger-scheduler.project + role = "roles/cloudbuild.builds.editor" + member = "serviceAccount:${google_service_account.cloud-build-trigger-scheduler.email}" +} + module "nightly-py37-tpuvm" { source = "./modules/trigger" @@ -44,6 +67,7 @@ module "nightly-py37-tpuvm" { python_version = "3.7" platform = "tpuvm" docker_build_args = [ "tpuvm=1" ] + scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email } module "nightly-py38-tpuvm" { @@ -53,6 +77,7 @@ module "nightly-py38-tpuvm" { python_version = "3.8" platform = "tpuvm" docker_build_args = [ "tpuvm=1" ] + scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email } module "nightly-py38-tpunode" { @@ -62,6 +87,7 @@ module "nightly-py38-tpunode" { python_version = "3.8" platform = "tpunode" docker_build_args = [ "tpuvm=0" ] + scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email } module "nightly-py38-cuda112" { @@ -71,4 +97,18 @@ module "nightly-py38-cuda112" { python_version = "3.8" platform = "cuda112" docker_build_args = [ "tpuvm=0,cuda=1"] + scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email +} + +module "r113-py37-tpuvm" { + source = "./modules/trigger" + + release = "1.13" + branch = "r1.13" + build_on_push = true + schedule = null + python_version = "3.7" + platform = "tpuvm" + docker_build_args = [ "tpuvm=1" ] + scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email } diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf index 9b8b5dbf82ca..c8c20de0a343 100644 --- a/docker/experimental/terraform/modules/trigger/main.tf +++ b/docker/experimental/terraform/modules/trigger/main.tf @@ -1,7 +1,14 @@ +data "google_project" "project" { } + variable "release" { type = string } +variable "branch" { + type = string + default = "master" +} + variable
"python_version" { description = "Python version to use (e.g. 3.8)" type = string @@ -16,25 +23,53 @@ variable "docker_build_args" { default = [ "tpuvm=1" ] } +variable "schedule" { + type = string + # Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules + default = "0 0 * * *" +} + +variable "scheduler_service_account" { + type = string + default = null +} + +variable "build_on_push" { + type = bool + default = false +} + locals { - trigger_name = format("pytorch-xla-%s-py%s-%s", var.release, replace(var.python_version, ".", ""), var.platform) + trigger_name = format("pytorch-xla-%s-py%s-%s", replace(var.release, ".", "-"), replace(var.python_version, ".", ""), var.platform) } resource "google_cloudbuild_trigger" "build-trigger" { location = "global" name = local.trigger_name + dynamic "github" { + # HACK: `source_to_build` is mutually exclusive with `github` + for_each = var.build_on_push ? [1] : [] + + content { + owner = "pytorch" + name = "xla" + push { + # `branch` is treated as a regex, so look for exact match + branch = "^${var.branch}$" + } + } + } + source_to_build { uri = "https://github.com/pytorch/xla" repo_type = "GITHUB" - # TODO: make branch configurable - ref = "refs/heads/master" + ref = "refs/heads/${var.branch}" } git_file_source { path = "docker/experimental/cloudbuild.yaml" repo_type = "GITHUB" - # TODO: make branch configurable revision = "refs/heads/master" uri = "https://github.com/pytorch/xla" } @@ -47,11 +82,12 @@ resource "google_cloudbuild_trigger" "build-trigger" { } resource "google_cloud_scheduler_job" "trigger-schedule" { + count = var.schedule != null ?
1 : 0 + name = format("%s-schedule", local.trigger_name) region = "us-central1" - # Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules - schedule = "0 0 * * *" + schedule = var.schedule time_zone = "America/Los_Angeles" http_target { @@ -59,8 +95,7 @@ resource "google_cloud_scheduler_job" "trigger-schedule" { uri = "https://cloudbuild.googleapis.com/v1/projects/${google_cloudbuild_trigger.build-trigger.project}/triggers/${google_cloudbuild_trigger.build-trigger.trigger_id}:run" oauth_token { - # TODO: Include this SA in config - service_account_email = "cloud-build-trigger-scheduler@tpu-pytorch.iam.gserviceaccount.com" + service_account_email = var.scheduler_service_account } } } From 339aad724ff8884ba99101eaae51f5a0906092e6 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Wed, 21 Dec 2022 22:12:01 +0000 Subject: [PATCH 2/9] Fix 1.13 GCB trigger --- docker/experimental/terraform/main.tf | 2 +- docker/experimental/terraform/modules/trigger/main.tf | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docker/experimental/terraform/main.tf b/docker/experimental/terraform/main.tf index 9a75acb7b5c3..7d7db9345dba 100644 --- a/docker/experimental/terraform/main.tf +++ b/docker/experimental/terraform/main.tf @@ -104,7 +104,7 @@ module "r113-py37-tpuvm" { source = "./modules/trigger" release = "1.13" - branch = "r1.13" + branch = "wcromar/r1.13-kaggle" build_on_push = true schedule = null python_version = "3.7" diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf index c8c20de0a343..92792c3d8793 100644 --- a/docker/experimental/terraform/modules/trigger/main.tf +++ b/docker/experimental/terraform/modules/trigger/main.tf @@ -70,8 +70,6 @@ resource "google_cloudbuild_trigger" "build-trigger" { git_file_source { path = "docker/experimental/cloudbuild.yaml" repo_type = "GITHUB" - revision = "refs/heads/master" - uri = "https://github.com/pytorch/xla" } substitutions
= { From b04639ddb3678e2a403b1da0438ebf57004082e1 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Wed, 21 Dec 2022 22:18:50 +0000 Subject: [PATCH 3/9] Fix torch pins and patches --- docker/experimental/Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docker/experimental/Dockerfile b/docker/experimental/Dockerfile index c10286623ab5..4db465bb40aa 100644 --- a/docker/experimental/Dockerfile +++ b/docker/experimental/Dockerfile @@ -10,7 +10,7 @@ ARG tf_cuda_compute_capabilities="7.0,7.5,8.0" ARG tpuvm=1 ARG build_cpp_tests=0 -ARG package_version=1.14.0 +ARG package_version=2.0.0 ARG bazel_jobs= @@ -43,10 +43,13 @@ ENV CXX=clang++-8 RUN pip install mkl mkl-include setuptools typing_extensions cmake requests -RUN git clone --recursive --depth=1 https://github.com/pytorch/pytorch.git +RUN git clone --depth=1 https://github.com/pytorch/pytorch.git WORKDIR /pytorch COPY torch_patches/ torch_patches/ -RUN bash -c "torch_pin=$(cat torch_patches/.torch_pin &); git fetch origin ${torch_pin:-master}; git checkout origin/${torch_pin:-master}" +RUN bash -c 'torch_pin=$(cat torch_patches/.torch_pin &); git fetch origin ${torch_pin:-master}; git checkout FETCH_HEAD' +RUN git submodule update --init --recursive +RUN for p in torch_patches/*.diff; do patch -N -p1 < $p; done + # Disable CUDA for PyTorch ENV USE_CUDA "0" From 099f80fe78277ef9e03c6e34d1819b612a264848 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Wed, 21 Dec 2022 22:28:56 +0000 Subject: [PATCH 4/9] Remove `git_file_source` entirely --- docker/experimental/terraform/modules/trigger/main.tf | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf index 92792c3d8793..5b21c31c19ac 100644 --- a/docker/experimental/terraform/modules/trigger/main.tf +++ b/docker/experimental/terraform/modules/trigger/main.tf @@ -46,6 +46,7 @@ locals { resource 
"google_cloudbuild_trigger" "build-trigger" { location = "global" name = local.trigger_name + filename = "docker/experimental/cloudbuild.yaml" dynamic "github" { # HACK: `source_to_build` is mutually exclusive with `github` @@ -67,11 +68,6 @@ resource "google_cloudbuild_trigger" "build-trigger" { ref = "refs/heads/${var.branch}" } - git_file_source { - path = "docker/experimental/cloudbuild.yaml" - repo_type = "GITHUB" - } - substitutions = { _PLATFORM = var.platform _BUILD_ARGS = join(",", var.docker_build_args) From 6fdadd48c120275cc7b1b078a9bfbb4aa1482495 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Thu, 22 Dec 2022 18:04:59 +0000 Subject: [PATCH 5/9] Set the correct release tag --- docker/experimental/terraform/modules/trigger/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf index 5b21c31c19ac..717567e852c1 100644 --- a/docker/experimental/terraform/modules/trigger/main.tf +++ b/docker/experimental/terraform/modules/trigger/main.tf @@ -69,6 +69,7 @@ resource "google_cloudbuild_trigger" "build-trigger" { } substitutions = { + _RELEASE_VERSION = var.release _PLATFORM = var.platform _BUILD_ARGS = join(",", var.docker_build_args) _PYTHON_VERSION = var.python_version From dba540f6f574b785c35f223c6768a797c16736fc Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Thu, 29 Dec 2022 17:36:17 +0000 Subject: [PATCH 6/9] Fix for empty patches --- docker/experimental/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker/experimental/Dockerfile b/docker/experimental/Dockerfile index 4db465bb40aa..80f32e7d8633 100644 --- a/docker/experimental/Dockerfile +++ b/docker/experimental/Dockerfile @@ -48,7 +48,7 @@ WORKDIR /pytorch COPY torch_patches/ torch_patches/ RUN bash -c 'torch_pin=$(cat torch_patches/.torch_pin &); git fetch origin ${torch_pin:-master}; git checkout FETCH_HEAD' RUN git submodule update --init --recursive 
-RUN for p in torch_patches/*.diff; do patch -N -p1 < $p; done +RUN find torch_patches -name '*.diff' | sort | xargs -r -n 1 patch -N -p1 -i # Disable CUDA for PyTorch @@ -71,8 +71,7 @@ FROM builder AS artifacts COPY tf_patches/ tf_patches/ COPY third_party/ third_party/ - -RUN for p in tf_patches/*.diff; do patch -d third_party/tensorflow -N -p1 < $p; done +RUN find "$PWD/tf_patches" -name '*.diff' | sort | xargs -r -n 1 patch -d third_party/tensorflow -N -p1 -i COPY build_torch_xla_libs.sh . From 9d39dd9e2785e9e81d76ee542754615a0ae1554f Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Thu, 29 Dec 2022 17:36:30 +0000 Subject: [PATCH 7/9] Go back to Docker build --- docker/experimental/cloudbuild.yaml | 45 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/docker/experimental/cloudbuild.yaml b/docker/experimental/cloudbuild.yaml index e7c8b8330b14..4b6a6ab47491 100644 --- a/docker/experimental/cloudbuild.yaml +++ b/docker/experimental/cloudbuild.yaml @@ -18,9 +18,6 @@ steps: args=(${_BUILD_ARGS//,/ }) flags=( - --cache=${_CACHE} - --cache-ttl=${_CACHE_TTL} - --cache-repo=${_CACHE_IMAGE_URL} --build-arg=bazel_jobs=${_BAZEL_JOBS} --build-arg=python_version=${_PYTHON_VERSION} # TODO: use current version in setup.py @@ -31,31 +28,27 @@ steps: - name: docker path: /docker - id: 'build-wheels' - name: 'gcr.io/kaniko-project/executor:debug' + name: 'gcr.io/cloud-builders/docker' entrypoint: sh args: - -cx - | - /kaniko/executor \ - --dockerfile=docker/experimental/Dockerfile \ - --destination=${_ARTIFACTS_IMAGE_URL} \ - --tar-path=/docker/artifacts-image.tar \ + docker build .
\ + --file=docker/experimental/Dockerfile \ + --tag=${_ARTIFACTS_IMAGE_URL} \ --target=artifacts \ $(cat /docker/flags.txt) timeout: 14400s volumes: - name: docker path: /docker -- id: 'import-artifacts' - name: gcr.io/cloud-builders/docker +- id: 'push-artifacts' + name: 'gcr.io/cloud-builders/docker' args: - - load - - --input - - /docker/artifacts-image.tar - volumes: - - name: docker - path: /docker + - push + - ${_ARTIFACTS_IMAGE_URL} - id: 'copy-wheels' + waitFor: ['build-wheels'] name: ${_ARTIFACTS_IMAGE_URL} entrypoint: bash args: @@ -67,6 +60,7 @@ steps: - name: 'wheels' path: /wheels - id: 'install-twine' + waitFor: ['build-wheels'] name: python entrypoint: pip args: @@ -90,21 +84,26 @@ steps: path: /wheels - id: 'release-image' waitFor: ['build-wheels'] - name: 'gcr.io/kaniko-project/executor:debug' + name: 'gcr.io/cloud-builders/docker' entrypoint: sh args: - -cx - | - /kaniko/executor \ - --dockerfile=docker/experimental/Dockerfile \ - --destination=${_RELEASE_IMAGE_URL} \ + docker build . \ + --file=docker/experimental/Dockerfile \ + --tag=${_RELEASE_IMAGE_URL} \ $(cat /docker/flags.txt) timeout: 14400s volumes: - name: docker path: /docker +- id: 'push-release' + name: 'gcr.io/cloud-builders/docker' + args: + - push + - ${_RELEASE_IMAGE_URL} - id: 'tag-image' - waitFor: ['release-image'] + waitFor: ['push-artifacts', 'push-release'] name: 'google/cloud-sdk' entrypoint: bash args: @@ -129,8 +128,8 @@ substitutions: options: # To run in a test project, either point to your pool or replace this with # `machineType`. You may have to reduce _BAZEL_JOBS. 
- workerPool: projects/tpu-pytorch/locations/us-central1/wheel_build - # machineType: E2_HIGHCPU_32 + # workerPool: projects/tpu-pytorch/locations/us-central1/wheel_build + machineType: E2_HIGHCPU_32 dynamic_substitutions: true substitution_option: 'ALLOW_LOOSE' timeout: 24000s From 7b8d9ee484846dff8b7dd2e9bb90a810eb8bc931 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Tue, 3 Jan 2023 18:37:15 +0000 Subject: [PATCH 8/9] Re-enable worker pool and increase bazel jobs --- docker/experimental/cloudbuild.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/experimental/cloudbuild.yaml b/docker/experimental/cloudbuild.yaml index 4b6a6ab47491..c98f8f4a23dc 100644 --- a/docker/experimental/cloudbuild.yaml +++ b/docker/experimental/cloudbuild.yaml @@ -121,15 +121,15 @@ substitutions: _RELEASE_IMAGE_URL: ${_IMAGE_REPOSITORY}/torch-xla:${_IMAGE_TAG} _ARTIFACTS_IMAGE_URL: ${_IMAGE_REPOSITORY}/artifacts:${_IMAGE_TAG} _CACHE_IMAGE_URL: ${_IMAGE_REPOSITORY}/cache - _BAZEL_JOBS: '16' + _BAZEL_JOBS: '32' _CACHE: 'true' _CACHE_TTL: '18h' _BUILD_ARGS: tpuvm=1,cuda=0 options: # To run in a test project, either point to your pool or replace this with # `machineType`. You may have to reduce _BAZEL_JOBS. 
- # workerPool: projects/tpu-pytorch/locations/us-central1/wheel_build - machineType: E2_HIGHCPU_32 + workerPool: projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build + # machineType: E2_HIGHCPU_32 dynamic_substitutions: true substitution_option: 'ALLOW_LOOSE' timeout: 24000s From a52a6c85a8ddc47f68e9659bc95775ba17b2fd6b Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Tue, 3 Jan 2023 18:42:30 +0000 Subject: [PATCH 9/9] Correct comment --- docker/experimental/terraform/modules/trigger/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf index 717567e852c1..084317b204c6 100644 --- a/docker/experimental/terraform/modules/trigger/main.tf +++ b/docker/experimental/terraform/modules/trigger/main.tf @@ -49,7 +49,7 @@ resource "google_cloudbuild_trigger" "build-trigger" { filename = "docker/experimental/cloudbuild.yaml" dynamic "github" { - # HACK: `source_to_build` is mutually exclusive with `github` + # HACK: only add `github` section at all when building on push for_each = var.build_on_push ? [1] : [] content {