# Patch: docker/experimental/Dockerfile
# - Bumps the default `package_version` build arg from 1.14.0 to 2.0.0.
# - Clones PyTorch without --recursive, checks out FETCH_HEAD for the pinned
#   ref, then initialises submodules and applies the torch_patches diffs.
# - Applies the tf_patches diffs with find/xargs instead of a shell for-loop.
# Review fixes to the added RUN lines:
# - `find -wholename 'dir/*.diff'` searches from `.`, so every candidate path
#   starts with `./` and the pattern never matches; with `xargs -r` the
#   patches were silently skipped. Search the directory with -name instead.
# - xargs is limited to one file per `patch` invocation (-n1) because
#   `patch -i` takes a single patch file; `sort` gives a stable order.
# - The tf_patches path is made absolute because `patch -d` changes directory
#   before the -i file is opened.
# NOTE(review): blank context lines were reconstructed from the hunk line
# counts; verify against the target file before applying.
diff --git a/docker/experimental/Dockerfile b/docker/experimental/Dockerfile
index c10286623ab5..80f32e7d8633 100644
--- a/docker/experimental/Dockerfile
+++ b/docker/experimental/Dockerfile
@@ -10,7 +10,7 @@ ARG tf_cuda_compute_capabilities="7.0,7.5,8.0"
 
 ARG tpuvm=1
 ARG build_cpp_tests=0
-ARG package_version=1.14.0
+ARG package_version=2.0.0
 
 ARG bazel_jobs=
 
@@ -43,10 +43,13 @@ ENV CXX=clang++-8
 
 RUN pip install mkl mkl-include setuptools typing_extensions cmake requests
 
-RUN git clone --recursive --depth=1 https://github.com/pytorch/pytorch.git
+RUN git clone --depth=1 https://github.com/pytorch/pytorch.git
 WORKDIR /pytorch
 COPY torch_patches/ torch_patches/
-RUN bash -c "torch_pin=$(cat torch_patches/.torch_pin &); git fetch origin ${torch_pin:-master}; git checkout origin/${torch_pin:-master}"
+RUN bash -c 'torch_pin=$(cat torch_patches/.torch_pin &); git fetch origin ${torch_pin:-master}; git checkout FETCH_HEAD'
+RUN git submodule update --init --recursive
+RUN find torch_patches -name '*.diff' | sort | xargs -r -n1 patch -N -p1 -i
+
 
 # Disable CUDA for PyTorch
 ENV USE_CUDA "0"
@@ -68,8 +71,7 @@ FROM builder AS artifacts
 
 COPY tf_patches/ tf_patches/
 COPY third_party/ third_party/
-
-RUN for p in tf_patches/*.diff; do patch -d third_party/tensorflow -N -p1 < $p; done
+RUN find "$(pwd)/tf_patches" -name '*.diff' | sort | xargs -r -n1 patch -d third_party/tensorflow -N -p1 -i
 
 COPY build_torch_xla_libs.sh .
 
# Patch: docker/experimental/cloudbuild.yaml
# - Replaces the kaniko executor steps with `docker build` plus explicit
#   `docker push` steps, and drops the kaniko --cache* flags.
# - Raises _BAZEL_JOBS to 32 and runs the build on a private worker pool.
# Review fix: the pool is referenced through `options.pool.name` using the
# full resource name (`.../workerPools/<id>`); the original value omitted the
# `workerPools/` segment. The last hunk's new-side count is 16 as a result.
#
# Patch: docker/experimental/terraform/main.tf
# - Declares the `wheel_build` worker pool, the scheduler service account and
#   its IAM binding, passes the account to every trigger module, and adds a
#   push-triggered r1.13 module.
#
# Patch: docker/experimental/terraform/modules/trigger/main.tf
# - Adds `branch`, `schedule`, `scheduler_service_account` and `build_on_push`
#   variables; makes the scheduler job optional (`count`).
# Review fix: `build_on_push` is declared as `bool` (its default is `false`
# and it is used as a conditional), not `string`.
#
# NOTE(review): blank context lines and indentation were reconstructed from
# the hunk line counts; verify against the target files before applying.
diff --git a/docker/experimental/cloudbuild.yaml b/docker/experimental/cloudbuild.yaml
index cfe8f4deba8e..c98f8f4a23dc 100644
--- a/docker/experimental/cloudbuild.yaml
+++ b/docker/experimental/cloudbuild.yaml
@@ -18,9 +18,6 @@ steps:
 
     args=(${_BUILD_ARGS//,/ })
     flags=(
-      --cache=${_CACHE}
-      --cache-ttl=${_CACHE_TTL}
-      --cache-repo=${_CACHE_IMAGE_URL}
       --build-arg=bazel_jobs=${_BAZEL_JOBS}
       --build-arg=python_version=${_PYTHON_VERSION}
       # TODO: use current version in setup.py
@@ -31,31 +28,27 @@ steps:
   - name: docker
     path: /docker
 - id: 'build-wheels'
-  name: 'gcr.io/kaniko-project/executor:debug'
+  name: 'gcr.io/cloud-builders/docker'
   entrypoint: sh
   args:
   - -cx
   - |
-    /kaniko/executor \
-      --dockerfile=docker/experimental/Dockerfile \
-      --destination=${_ARTIFACTS_IMAGE_URL} \
-      --tar-path=/docker/artifacts-image.tar \
+    docker build . \
+      --file=docker/experimental/Dockerfile \
+      --tag=${_ARTIFACTS_IMAGE_URL} \
       --target=artifacts \
       $(cat /docker/flags.txt)
   timeout: 14400s
   volumes:
   - name: docker
     path: /docker
-- id: 'import-artifacts'
-  name: gcr.io/cloud-builders/docker
+- id: 'push-artifacts'
+  name: 'gcr.io/cloud-builders/docker'
   args:
-  - load
-  - --input
-  - /docker/artifacts-image.tar
-  volumes:
-  - name: docker
-    path: /docker
+  - push
+  - ${_ARTIFACTS_IMAGE_URL}
 - id: 'copy-wheels'
+  waitFor: ['build-wheels']
   name: ${_ARTIFACTS_IMAGE_URL}
   entrypoint: bash
   args:
@@ -67,6 +60,7 @@ steps:
   - name: 'wheels'
     path: /wheels
 - id: 'install-twine'
+  waitFor: ['build-wheels']
   name: python
   entrypoint: pip
   args:
@@ -90,21 +84,26 @@ steps:
     path: /wheels
 - id: 'release-image'
   waitFor: ['build-wheels']
-  name: 'gcr.io/kaniko-project/executor:debug'
+  name: 'gcr.io/cloud-builders/docker'
   entrypoint: sh
   args:
   - -cx
   - |
-    /kaniko/executor \
-      --dockerfile=docker/experimental/Dockerfile \
-      --destination=${_RELEASE_IMAGE_URL} \
+    docker build . \
+      --file=docker/experimental/Dockerfile \
+      --tag=${_RELEASE_IMAGE_URL} \
       $(cat /docker/flags.txt)
   timeout: 14400s
   volumes:
   - name: docker
     path: /docker
+- id: 'push-release'
+  name: 'gcr.io/cloud-builders/docker'
+  args:
+  - push
+  - ${_RELEASE_IMAGE_URL}
 - id: 'tag-image'
-  waitFor: ['release-image']
+  waitFor: ['push-artifacts', 'push-release']
   name: 'google/cloud-sdk'
   entrypoint: bash
   args:
@@ -122,12 +121,16 @@ substitutions:
   _RELEASE_IMAGE_URL: ${_IMAGE_REPOSITORY}/torch-xla:${_IMAGE_TAG}
   _ARTIFACTS_IMAGE_URL: ${_IMAGE_REPOSITORY}/artifacts:${_IMAGE_TAG}
   _CACHE_IMAGE_URL: ${_IMAGE_REPOSITORY}/cache
-  _BAZEL_JOBS: '16'
+  _BAZEL_JOBS: '32'
   _CACHE: 'true'
   _CACHE_TTL: '18h'
   _BUILD_ARGS: tpuvm=1,cuda=0
 options:
-  machineType: E2_HIGHCPU_32
+  # To run in a test project, either point to your pool or replace this with
+  # `machineType`. You may have to reduce _BAZEL_JOBS.
+  pool:
+    name: projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build
+  # machineType: E2_HIGHCPU_32
   dynamic_substitutions: true
   substitution_option: 'ALLOW_LOOSE'
 timeout: 24000s
diff --git a/docker/experimental/terraform/main.tf b/docker/experimental/terraform/main.tf
index 148c68868018..7d7db9345dba 100644
--- a/docker/experimental/terraform/main.tf
+++ b/docker/experimental/terraform/main.tf
@@ -37,6 +37,29 @@ resource "google_artifact_registry_repository" "torch-xla-docker-repo" {
   format = "DOCKER"
 }
 
+resource "google_cloudbuild_worker_pool" "gcb-pool" {
+  name = "wheel_build"
+  location = "us-central1"
+
+  worker_config {
+    disk_size_gb = 500
+    machine_type = "e2-standard-32"
+    no_external_ip = false
+  }
+}
+
+resource "google_service_account" "cloud-build-trigger-scheduler" {
+  account_id = "cloud-build-trigger-scheduler"
+  display_name = "Cloud Build Trigger Scheduler"
+  description = "Service account for running Cloud Build triggers in a Cloud Scheduler job"
+}
+
+resource "google_project_iam_member" "cloud-build-scheduler-permission" {
+  project = google_service_account.cloud-build-trigger-scheduler.project
+  role = "roles/cloudbuild.builds.editor"
+  member = "serviceAccount:${google_service_account.cloud-build-trigger-scheduler.email}"
+}
+
 module "nightly-py37-tpuvm" {
   source = "./modules/trigger"
 
@@ -44,6 +67,7 @@ module "nightly-py37-tpuvm" {
   python_version = "3.7"
   platform = "tpuvm"
   docker_build_args = [ "tpuvm=1" ]
+  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
 }
 
 module "nightly-py38-tpuvm" {
@@ -53,6 +77,7 @@ module "nightly-py38-tpuvm" {
   python_version = "3.8"
   platform = "tpuvm"
   docker_build_args = [ "tpuvm=1" ]
+  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
 }
 
 module "nightly-py38-tpunode" {
@@ -62,6 +87,7 @@ module "nightly-py38-tpunode" {
   python_version = "3.8"
   platform = "tpunode"
   docker_build_args = [ "tpuvm=0" ]
+  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
 }
 
 module "nightly-py38-cuda112" {
@@ -71,4 +97,18 @@ module "nightly-py38-cuda112" {
   python_version = "3.8"
   platform = "cuda112"
   docker_build_args = [ "tpuvm=0,cuda=1"]
+  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
+}
+
+module "r113-py37-tpuvm" {
+  source = "./modules/trigger"
+
+  release = "1.13"
+  branch = "wcromar/r1.13-kaggle"
+  build_on_push = true
+  schedule = null
+  python_version = "3.7"
+  platform = "tpuvm"
+  docker_build_args = [ "tpuvm=1" ]
+  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
 }
diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf
index 9b8b5dbf82ca..084317b204c6 100644
--- a/docker/experimental/terraform/modules/trigger/main.tf
+++ b/docker/experimental/terraform/modules/trigger/main.tf
@@ -1,7 +1,14 @@
+data "google_project" "project" { }
+
 variable "release" {
   type = string
 }
 
+variable "branch" {
+  type = string
+  default = "master"
+}
+
 variable "python_version" {
   description = "Python version to use (e.g. 3.8)"
   type = string
@@ -16,30 +23,53 @@ variable "docker_build_args" {
   default = [ "tpuvm=1" ]
 }
 
+variable "schedule" {
+  type = string
+  # Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules
+  default = "0 0 * * *"
+}
+
+variable "scheduler_service_account" {
+  type = string
+  default = null
+}
+
+variable "build_on_push" {
+  type = bool
+  default = false
+}
+
 locals {
-  trigger_name = format("pytorch-xla-%s-py%s-%s", var.release, replace(var.python_version, ".", ""), var.platform)
+  trigger_name = format("pytorch-xla-%s-py%s-%s", replace(var.release, ".", "-"), replace(var.python_version, ".", ""), var.platform)
 }
 
 resource "google_cloudbuild_trigger" "build-trigger" {
   location = "global"
   name = local.trigger_name
+  filename = "docker/experimental/cloudbuild.yaml"
 
-  source_to_build {
-    uri = "https://github.com/pytorch/xla"
-    repo_type = "GITHUB"
-    # TODO: make branch configurable
-    ref = "refs/heads/master"
+  dynamic "github" {
+    # HACK: only add `github` section at all when building on push
+    for_each = var.build_on_push ? [1] : []
+
+    content {
+      owner = "pytorch"
+      name = "xla"
+      push {
+        # `branch` is treated as a regex, so look for exact match
+        branch = "^${var.branch}$"
+      }
+    }
   }
 
-  git_file_source {
-    path = "docker/experimental/cloudbuild.yaml"
-    repo_type = "GITHUB"
-    # TODO: make branch configurable
-    revision = "refs/heads/master"
+  source_to_build {
     uri = "https://github.com/pytorch/xla"
+    repo_type = "GITHUB"
+    ref = "refs/heads/${var.branch}"
   }
 
   substitutions = {
+    _RELEASE_VERSION = var.release
     _PLATFORM = var.platform
     _BUILD_ARGS = join(",", var.docker_build_args)
     _PYTHON_VERSION = var.python_version
@@ -47,11 +77,12 @@ resource "google_cloudbuild_trigger" "build-trigger" {
 }
 
 resource "google_cloud_scheduler_job" "trigger-schedule" {
+  count = var.schedule != null ? 1 : 0
+
   name = format("%s-schedule", local.trigger_name)
   region = "us-central1"
 
-  # Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules
-  schedule = "0 0 * * *"
+  schedule = var.schedule
   time_zone = "America/Los_Angeles"
 
   http_target {
@@ -59,8 +90,7 @@ resource "google_cloud_scheduler_job" "trigger-schedule" {
     uri = "https://cloudbuild.googleapis.com/v1/projects/${google_cloudbuild_trigger.build-trigger.project}/triggers/${google_cloudbuild_trigger.build-trigger.trigger_id}:run"
 
     oauth_token {
-      # TODO: Include this SA in config
-      service_account_email = "cloud-build-trigger-scheduler@tpu-pytorch.iam.gserviceaccount.com"
+      service_account_email = var.scheduler_service_account
     }
   }
 }