diff --git a/docker/experimental/ansible/Dockerfile b/docker/experimental/ansible/Dockerfile new file mode 100644 index 000000000000..741a6051124b --- /dev/null +++ b/docker/experimental/ansible/Dockerfile @@ -0,0 +1,56 @@ +ARG python_version=3.8 +ARG debian_version=buster + +FROM python:${python_version}-${debian_version} AS build + +RUN pip install ansible + +COPY . /ansible +WORKDIR /ansible + +ARG arch=amd64 +ARG accelerator=tpu +ARG cuda_version=11.8 +ARG pytorch_git_rev=HEAD +ARG xla_git_rev=HEAD +ARG package_version + +RUN ansible-playbook -vvv playbook.yaml -e \ + "stage=build \ + arch=${arch} \ + accelerator=${accelerator} \ + cuda_version=${cuda_version} \ + pytorch_git_rev=${pytorch_git_rev} \ + xla_git_rev=${xla_git_rev} \ + package_version=${package_version}" + +FROM python:${python_version}-${debian_version} AS release + +WORKDIR /ansible +COPY . /ansible + +ARG arch=amd64 +ARG accelerator=tpu +ARG cuda_version=11.8 +ARG pytorch_git_rev=HEAD +ARG xla_git_rev=HEAD + +RUN pip install ansible +RUN ansible-playbook -vvv playbook.yaml -e \ + "stage=release \ + arch=${arch} \ + accelerator=${accelerator} \ + cuda_version=${cuda_version} \ + pytorch_git_rev=${pytorch_git_rev} \ + xla_git_rev=${xla_git_rev} \ + " --tags "install_deps" + +WORKDIR /dist +COPY --from=build /src/pytorch/dist/*.whl . +COPY --from=build /src/pytorch/xla/dist/*.whl . + +RUN echo "Installing the following wheels" && ls /dist/*.whl +RUN pip install *.whl + +WORKDIR / +RUN rm -rf /ansible diff --git a/docker/experimental/ansible/ansible.cfg b/docker/experimental/ansible/ansible.cfg index cb7519265802..490c16aea505 100644 --- a/docker/experimental/ansible/ansible.cfg +++ b/docker/experimental/ansible/ansible.cfg @@ -7,6 +7,8 @@ callbacks_enabled = profile_tasks # The playbooks is only run on the implicit localhost. # Silence warning about empty hosts inventory. localhost_warning = False +# Make output human-readable. 
+stdout_callback = yaml [inventory] # Silence warning about no inventory. diff --git a/docker/experimental/ansible/config/apt.yaml b/docker/experimental/ansible/config/apt.yaml index 97ce1755f234..805bbf64db01 100644 --- a/docker/experimental/ansible/config/apt.yaml +++ b/docker/experimental/ansible/config/apt.yaml @@ -15,11 +15,11 @@ apt: - wget build_cuda: - - cuda-libraries-11-8 - - cuda-toolkit-11-8 - - cuda-minimal-build-11-8 - - libcudnn8=8.8.0.121-1+cuda11.8 - - libcudnn8-dev=8.8.0.121-1+cuda11.8 + - "cuda-libraries-{{ cuda_version | replace('.', '-') }}" + - "cuda-toolkit-{{ cuda_version | replace('.', '-') }}" + - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}" + - "{{ cuda_deps['libcudnn'][cuda_version] }}" + - "{{ cuda_deps['libcudnn-dev'][cuda_version] }}" build_amd64: - "clang-{{ clang_version }}" @@ -39,9 +39,9 @@ apt: - patch release_cuda: - - cuda-libraries-11-8 - - cuda-minimal-build-11-8 - - libcudnn8=8.8.0.121-1+cuda11.8 + - "cuda-libraries-{{ cuda_version | replace('.', '-') }}" + - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}" + - "{{ cuda_deps['libcudnn'][cuda_version] }}" # Specify objects with string fields `url` and `keyring`. # The keyring path should start with /usr/share/keyrings/ for debian and ubuntu. diff --git a/docker/experimental/ansible/config/cuda_deps.yaml b/docker/experimental/ansible/config/cuda_deps.yaml new file mode 100644 index 000000000000..d57f00eba20e --- /dev/null +++ b/docker/experimental/ansible/config/cuda_deps.yaml @@ -0,0 +1,7 @@ +# Versions of cuda dependencies for given cuda versions. +# Note: wrap version in quotes to ensure they're treated as strings. 
+cuda_deps: + libcudnn: + "11.8": libcudnn8=8.8.0.121-1+cuda11.8 + libcudnn-dev: + "11.8": libcudnn8-dev=8.8.0.121-1+cuda11.8 diff --git a/docker/experimental/ansible/config/env.yaml b/docker/experimental/ansible/config/env.yaml index ce1e53d004cd..01b12eb7ab4a 100644 --- a/docker/experimental/ansible/config/env.yaml +++ b/docker/experimental/ansible/config/env.yaml @@ -2,8 +2,11 @@ # They'll be accessible for all processes on the host. release_env: common: - CC: "clang-{{ clang_version }}" - CXX: "clang++-{{ clang_version }}" + # Force GCC because clang/bazel has issues. + CC: gcc + CXX: g++ + # CC: "clang-{{ clang_version }}" + # CXX: "clang++-{{ clang_version }}" LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib" tpu: @@ -20,8 +23,11 @@ build_env: LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib" # Set explicitly to 0 as setup.py defaults this flag to true if unset. BUILD_CPP_TESTS: 0 - CC: "clang-{{ clang_version }}" - CXX: "clang++-{{ clang_version }}" + # Force GCC because clang/bazel has issues. + CC: gcc + CXX: g++ + # CC: "clang-{{ clang_version }}" + # CXX: "clang++-{{ clang_version }}" PYTORCH_BUILD_NUMBER: 1 TORCH_XLA_VERSION: "{{ package_version }}" PYTORCH_BUILD_VERSION: "{{ package_version }}" diff --git a/docker/experimental/ansible/config/vars.yaml b/docker/experimental/ansible/config/vars.yaml index 4afb567aacb2..4fed33c9d7da 100644 --- a/docker/experimental/ansible/config/vars.yaml +++ b/docker/experimental/ansible/config/vars.yaml @@ -1,5 +1,6 @@ # Used for fetching cuda from the right repo, see apt.yaml. cuda_repo: ubuntu1804 +cuda_version: "11.8" # Used for fetching clang from the right repo, see apt.yaml. 
llvm_debian_repo: buster clang_version: 10 diff --git a/docker/experimental/ansible/development.Dockerfile b/docker/experimental/ansible/development.Dockerfile new file mode 100644 index 000000000000..239d40cfc278 --- /dev/null +++ b/docker/experimental/ansible/development.Dockerfile @@ -0,0 +1,18 @@ +# Dockerfile for building a development image. +# The built image contains all required pip and apt packages for building and +# running PyTorch and PyTorch/XLA. The image doesn't contain any source code. +ARG python_version=3.8 +ARG debian_version=buster + +FROM python:${python_version}-${debian_version} + +RUN pip install ansible + +COPY . /ansible +WORKDIR /ansible + +ARG arch=amd64 +ARG accelerator=tpu + +RUN ansible-playbook playbook.yaml -e "stage=build arch=${arch} accelerator=${accelerator}" --skip-tags "fetch_srcs,build_srcs" +RUN ansible-playbook playbook.yaml -e "stage=release arch=${arch} accelerator=${accelerator}" --skip-tags "fetch_srcs,build_srcs" diff --git a/docker/experimental/ansible/playbook.yaml b/docker/experimental/ansible/playbook.yaml index 7d89fc335383..2667604ff20b 100644 --- a/docker/experimental/ansible/playbook.yaml +++ b/docker/experimental/ansible/playbook.yaml @@ -12,7 +12,7 @@ ansible.builtin.assert: that: "{{ lookup('ansible.builtin.vars', item.name) is regex(item.pattern) }}" fail_msg: | - "Variable '{{ item.name }}' doesn't match pattern '{{ item.pattern }}'" + "Variable '{{ item.name }} = '{{ lookup('ansible.builtin.vars', item.name) }}' doesn't match pattern '{{ item.pattern }}'" "Pass the required variable with: --e \"{{ item.name }}=\"" loop: - name: stage @@ -28,12 +28,16 @@ loop: # vars.yaml should be the first as other config files depend on it. - vars.yaml + # cuda_deps should be loaded before apt, since apt depends on it. + - cuda_deps.yaml - apt.yaml - pip.yaml - env.yaml + tags: always # Execute this task even with `--skip-tags` is used. 
roles: - - bazel + - role: bazel + tags: bazel - role: install_deps vars: @@ -62,12 +66,12 @@ pip.pkgs_nodeps[stage + '_' + arch] | default([], true) + pip.pkgs_nodeps[stage + '_' + accelerator] | default([], true) }}" + tags: install_deps - role: fetch_srcs vars: src_root: "/src" - pytorch_git_rev: HEAD - xla_git_rev: HEAD + tags: fetch_srcs - role: build_srcs vars: @@ -77,6 +81,7 @@ combine(build_env[arch] | default({}, true)) | combine(build_env[accelerator] | default({}, true)) }}" + tags: build_srcs - role: configure_env vars: @@ -86,3 +91,4 @@ combine(release_env[accelerator] | default({}, true)) }}" when: stage == "release" + tags: configure_env diff --git a/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml b/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml index 929a2404ac65..75f6b1233901 100644 --- a/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml +++ b/docker/experimental/ansible/roles/fetch_srcs/tasks/main.yaml @@ -34,6 +34,7 @@ # localhost. remote_src: true strip: 1 + ignore_whitespace: true basedir: "{{ (src_root, 'pytorch/xla/third_party/tensorflow') | path_join }}" loop: "{{ tf_patches.files | map(attribute='path') }}" ignore_errors: true diff --git a/docker/experimental/terraform/README.md b/docker/experimental/terraform/README.md index 29ab705b9c84..e86461a116c7 100644 --- a/docker/experimental/terraform/README.md +++ b/docker/experimental/terraform/README.md @@ -1,26 +1,17 @@ -# Terraform configuration for build/test resources +# Terraform for CloudBuild triggers -Download the latest Terraform binary for your system and add it to your `$PATH`: -https://developer.hashicorp.com/terraform/downloads +This Terraform setup provisions: +- public storage bucket for PyTorch and PyTorch/XLA wheels. +- private storage bucket for Terraform state. +- public artifact repository for docker images. +- cloud builds for nightly and release docker images and wheels. 
+- schedule jobs and a service account for triggering cloud build. -Terraform state is stored in a shared GCS bucket. To initialize Terraform, run -the following: +# Running -``` -# Authenticate with GCP -gcloud auth login --update-adc +1. Run `gcloud auth application-default login` on your local workstation. +2. Make sure that a recent Terraform binary is installed (>= 1.3.8). + If not, install Terraform from the [official source](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli). +3. Run `terraform apply -var-file=vars/staging.tfvars`. -# Initialize Terraform -terraform init -``` -To preview your changes run `terraform plan`. - -If the changes look correct, you can update the project with `terraform apply`. - -Resources: - -- Official Terraform documentation: https://developer.hashicorp.com/terraform/docs -- GCP Terraform documentation: https://cloud.google.com/docs/terraform/get-started-with-terraform -- Storing Terraform state in GCS: https://cloud.google.com/docs/terraform/resource-management/store-state -- Cloud Build Trigger documentation: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger diff --git a/docker/experimental/terraform/artifact_repo.tf b/docker/experimental/terraform/artifact_repo.tf new file mode 100644 index 000000000000..3777db47e2d6 --- /dev/null +++ b/docker/experimental/terraform/artifact_repo.tf @@ -0,0 +1,24 @@ +# Docker repository in Artifact Registry for all public images. +resource "google_artifact_registry_repository" "public_docker_repo" { + location = var.public_docker_repo.location + repository_id = var.public_docker_repo.id + description = "Official docker images." 
+ format = "DOCKER" +} + +resource "google_artifact_registry_repository_iam_member" "all_users_read_public_docker_repo" { + role = "roles/artifactregistry.reader" + member = "allUsers" + project = google_artifact_registry_repository.public_docker_repo.project + location = google_artifact_registry_repository.public_docker_repo.location + repository = google_artifact_registry_repository.public_docker_repo.name +} + +locals { + public_repo = google_artifact_registry_repository.public_docker_repo + public_docker_repo_url = "${local.public_repo.location}-docker.pkg.dev/${var.project_id}/${local.public_repo.repository_id}" +} + +output "public_docker_registry_url" { + value = local.public_docker_repo_url +} diff --git a/docker/experimental/terraform/buckets.tf b/docker/experimental/terraform/buckets.tf new file mode 100644 index 000000000000..619489b92506 --- /dev/null +++ b/docker/experimental/terraform/buckets.tf @@ -0,0 +1,43 @@ +resource "google_storage_bucket" "tfstate" { + name = "${var.project_id}-tfstate${var.storage_bucket_suffix}" + force_destroy = false + location = "US" + storage_class = "STANDARD" + + # Required by project policy. + # See https://cloud.google.com/storage/docs/uniform-bucket-level-access. + uniform_bucket_level_access = false + + versioning { + enabled = true + } +} + +# Storage bucket for all publicly released wheels. +resource "google_storage_bucket" "public_wheels" { + name = "${var.project_id}-wheels-public" + force_destroy = false + location = "US" + storage_class = "STANDARD" + + uniform_bucket_level_access = false + + versioning { + enabled = true + } +} + +# Grants all users (public) read access to the bucket with wheels. 
+resource "google_storage_bucket_access_control" "all_users_read_public_wheels" {
+  bucket = google_storage_bucket.public_wheels.name
+  role = "READER"
+  entity = "allUsers"
+}
+
+output "public_wheels_bucket_url" {
+  value = google_storage_bucket.public_wheels.url
+}
+
+output "tfstate_bucket_url" {
+  value = google_storage_bucket.tfstate.url
+}
diff --git a/docker/experimental/terraform/cloudbuild.tf b/docker/experimental/terraform/cloudbuild.tf
new file mode 100644
index 000000000000..4641cf529ad7
--- /dev/null
+++ b/docker/experimental/terraform/cloudbuild.tf
@@ -0,0 +1,144 @@
+# Detailed documentation on cloudbuild parameters:
+# https://cloud.google.com/build/docs/api/reference/rest/v1/projects.builds#resource-build
+
+locals {
+  docker_images_map = {
+    for di in var.docker_images :
+    # Use either provided trigger name or image name and append triggers_suffix.
+    "${coalesce(di.trigger_name, replace(di.image, "_", "-"))}${var.triggers_suffix}" => di
+  }
+}
+
+resource "google_cloudbuild_trigger" "docker_images" {
+  for_each = local.docker_images_map
+
+  location = "global"
+  name = each.key
+  description = each.value.description
+
+  dynamic "github" {
+    # Trigger on branch push only if there is any `trigger_files` file filter.
+    # To trigger on any push in the branch, set `trigger_files = ["**"]`.
+    for_each = length(each.value.trigger_files) > 0 ? [1] : []
+
+    content {
+      owner = "pytorch"
+      name = "xla"
+      push {
+        # `branch` is a regex, so look for exact match.
+        branch = each.value.branch != "" ? "^${each.value.branch}$" : null
+        tag = each.value.git_tag != "" ? "^${each.value.git_tag}$" : null
+      }
+    }
+  }
+
+  source_to_build {
+    uri = "https://github.com/pytorch/xla"
+    repo_type = "GITHUB"
+    ref = each.value.branch != "" ?
"refs/heads/${each.value.branch}" : "refs/tags/${each.value.git_tag}" + } + + included_files = each.value.trigger_files + + build { + step { + id = "build_${each.value.image}" + entrypoint = "bash" + name = "gcr.io/cloud-builders/docker" + dir = each.value.dir + args = [ + "-c", + join(" ", + concat( + ["docker", "build", "--progress=plain"], + # Pass build args to the docker image. + [for arg_key, arg_val in each.value.build_args : "--build-arg=${arg_key}=${arg_val}"], + # Pass all specified tags as $(echo ). + # This allows to compute dynamic tags, e.g. date. + [for tag in each.value.image_tags : + "-t=\"${local.public_docker_repo_url}/${each.value.image}:$(echo ${tag})\"" + ], + # Image used for the `copy_wheels_to_volume` step. + ["-t=local_image"], + # Specify input docker file and context (current directory - each.value.dir) + ["-f=${each.value.dockerfile}", "."] + ) + ) + ] + } + + step { + id = "push_${each.value.image}" + entrypoint = "bash" + name = "gcr.io/cloud-builders/docker" + args = [ + "-c", "docker push --all-tags ${local.public_docker_repo_url}/${each.value.image}" + ] + } + + dynamic "step" { + for_each = each.value.wheels ? [1] : [] + + content { + # Copy wheels from the last built image to the shared volume. + id = "copy_wheels_to_volume" + name = "local_image" + entrypoint = "bash" + args = [ + "-c", join(" ", + ["echo The following wheels will be published &&", + "ls /dist/*.whl &&", + "cp /dist/*.whl /wheels", + ] + ) + ] + + volumes { + name = "wheels" + path = "/wheels" + } + } + } + + dynamic "step" { + for_each = each.value.wheels ? [1] : [] + + content { + # Upload copied images from the shared volume to the public storage bucket. 
+ id = "upload_wheels_to_storage_bucket" + entrypoint = "bash" + name = "gcr.io/cloud-builders/gsutil" + args = [ + "-c", "gsutil cp /wheels/*.whl ${google_storage_bucket.public_wheels.url}", + ] + + volumes { + name = "wheels" + path = "/wheels" + } + } + } + + options { + substitution_option = "ALLOW_LOOSE" + dynamic_substitutions = true + worker_pool = local.worker_pool_id + } + + timeout = "${each.value.timeout_m * 60}s" + } + + include_build_logs = length(each.value.trigger_files) > 0 ? "INCLUDE_BUILD_LOGS_WITH_STATUS" : null +} + +# Add scheduled jobs for each cloudbuild_trigger with a schedule. +module "schedule_triggers" { + source = "./modules/trigger_schedule" + for_each = { + for name, di in local.docker_images_map : name => di if di.trigger_schedule != "" + } + + trigger = google_cloudbuild_trigger.docker_images[each.key] + schedule = each.value.trigger_schedule + scheduler_service_account = google_service_account.build_runner.email +} diff --git a/docker/experimental/terraform/main.tf b/docker/experimental/terraform/main.tf deleted file mode 100644 index be6e33c09834..000000000000 --- a/docker/experimental/terraform/main.tf +++ /dev/null @@ -1,119 +0,0 @@ -provider "google" { - project = "tpu-pytorch" -} - -resource "random_id" "bucket_prefix" { - byte_length = 8 -} - -resource "google_storage_bucket" "default" { - name = "${random_id.bucket_prefix.hex}-bucket-tfstate" - force_destroy = false - location = "US" - storage_class = "STANDARD" - versioning { - enabled = true - } -} - -terraform { - backend "gcs" { - bucket = "426a09baf5992b6a-bucket-tfstate" - prefix = "terraform/state" - } -} - -resource "google_artifact_registry_repository" "torch-xla-python-repo" { - location = "us" - repository_id = "torch-xla" - description = "PyTorch/XLA nightly packages" - format = "PYTHON" -} - -resource "google_artifact_registry_repository" "torch-xla-docker-repo" { - location = "us" - repository_id = "torch-xla-images" - description = "PyTorch/XLA nightly images" 
- format = "DOCKER" -} - -resource "google_cloudbuild_worker_pool" "gcb-pool" { - name = "wheel_build" - location = "us-central1" - - worker_config { - disk_size_gb = 500 - machine_type = "e2-standard-32" - no_external_ip = false - } -} - -resource "google_service_account" "cloud-build-trigger-scheduler" { - account_id = "cloud-build-trigger-scheduler" - display_name = "Cloud Build Trigger Scheduler" - description = "Service account for running Cloud Build triggers in a Cloud Scheduler job" -} - -resource "google_project_iam_member" "cloud-build-scheduler-permission" { - project = google_service_account.cloud-build-trigger-scheduler.project - role = "roles/cloudbuild.builds.editor" - member = "serviceAccount:${google_service_account.cloud-build-trigger-scheduler.email}" -} - -resource "google_cloudbuild_trigger" "tpu-test-trigger" { - location = "global" - name = "ci-tpu-test-trigger" - - github { - owner = "pytorch" - name = "xla" - push { - branch = "^master$" - } - } - - filename = "test/tpu/cloudbuild.yaml" -} - -module "nightly-py38-tpuvm" { - source = "./modules/trigger" - - release = "nightly" - python_version = "3.8" - platform = "tpuvm" - docker_build_args = [ "tpuvm=1" ] - scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email -} - -module "nightly-py38-tpunode" { - source = "./modules/trigger" - - release = "nightly" - python_version = "3.8" - platform = "tpunode" - docker_build_args = [ "tpuvm=0" ] - scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email -} - -module "nightly-py38-cuda112" { - source = "./modules/trigger" - - release = "nightly" - python_version = "3.8" - platform = "cuda112" - docker_build_args = [ "tpuvm=0,cuda=1"] - scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email -} - -module "r113-py37-tpuvm" { - source = "./modules/trigger" - - release = "1.13" - branch = "wcromar/r1.13-kaggle" - build_on_push = true - schedule = null - 
python_version = "3.7" - platform = "tpuvm" - docker_build_args = [ "tpuvm=1" ] - scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email -} diff --git a/docker/experimental/terraform/modules/trigger/main.tf b/docker/experimental/terraform/modules/trigger/main.tf deleted file mode 100644 index 084317b204c6..000000000000 --- a/docker/experimental/terraform/modules/trigger/main.tf +++ /dev/null @@ -1,96 +0,0 @@ -data "google_project" "project" { } - -variable "release" { - type = string -} - -variable "branch" { - type = string - default = "master" -} - -variable "python_version" { - description = "Python version to use (e.g. 3.8)" - type = string -} - -variable "platform" { - type = string -} - -variable "docker_build_args" { - type = list(string) - default = [ "tpuvm=1" ] -} - -variable "schedule" { - type = string - # Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules - default = "0 0 * * *" -} - -variable "scheduler_service_account" { - type = string - default = null -} - -variable "build_on_push" { - type = string - default = false -} - -locals { - trigger_name = format("pytorch-xla-%s-py%s-%s", replace(var.release, ".", "-"), replace(var.python_version, ".", ""), var.platform) -} - -resource "google_cloudbuild_trigger" "build-trigger" { - location = "global" - name = local.trigger_name - filename = "docker/experimental/cloudbuild.yaml" - - dynamic "github" { - # HACK: only add `github` section at all when building on push - for_each = var.build_on_push ? 
[1] : [] - - content { - owner = "pytorch" - name = "xla" - push { - # `branch` is treated as a regex, so look for exact match - branch = "^${var.branch}$" - } - } - } - - source_to_build { - uri = "https://github.com/pytorch/xla" - repo_type = "GITHUB" - ref = "refs/heads/${var.branch}" - } - - substitutions = { - _RELEASE_VERSION = var.release - _PLATFORM = var.platform - _BUILD_ARGS = join(",", var.docker_build_args) - _PYTHON_VERSION = var.python_version - } -} - -resource "google_cloud_scheduler_job" "trigger-schedule" { - count = var.schedule != null ? 1 : 0 - - name = format("%s-schedule", local.trigger_name) - region = "us-central1" - - schedule = var.schedule - time_zone = "America/Los_Angeles" - - http_target { - http_method = "POST" - uri = "https://cloudbuild.googleapis.com/v1/projects/${google_cloudbuild_trigger.build-trigger.project}/triggers/${google_cloudbuild_trigger.build-trigger.trigger_id}:run" - - oauth_token { - service_account_email = var.scheduler_service_account - } - } -} diff --git a/docker/experimental/terraform/modules/trigger_schedule/job.tf b/docker/experimental/terraform/modules/trigger_schedule/job.tf new file mode 100644 index 000000000000..1d6717f0f242 --- /dev/null +++ b/docker/experimental/terraform/modules/trigger_schedule/job.tf @@ -0,0 +1,41 @@ +# Provides a scheduled job that will trigger Cloud Build periodically. + +variable "schedule" { + description = "Job schedule in cron format https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules" + type = string + default = "0 0 * * *" +} + +variable "trigger" { + description = "An instance of google_cloudbuild_trigger for which the schedule job should be configured." + type = object({ + project = string + trigger_id = string + name = string + }) +} + +variable "time_zone" { + description = "The schedule will be relative to this time zone." 
+  default = "America/Los_Angeles"
+  type = string
+}
+
+variable "scheduler_service_account" {
+  type = string
+}
+
+resource "google_cloud_scheduler_job" "trigger-schedule" {
+  name = format("%s-schedule", var.trigger.name)
+  schedule = var.schedule
+  time_zone = var.time_zone
+
+  http_target {
+    http_method = "POST"
+    uri = "https://cloudbuild.googleapis.com/v1/projects/${var.trigger.project}/triggers/${var.trigger.trigger_id}:run"
+
+    oauth_token {
+      service_account_email = var.scheduler_service_account
+    }
+  }
+}
diff --git a/docker/experimental/terraform/provider.tf b/docker/experimental/terraform/provider.tf
new file mode 100644
index 000000000000..d987fc3b3630
--- /dev/null
+++ b/docker/experimental/terraform/provider.tf
@@ -0,0 +1,21 @@
+# Run `gcloud auth application-default login` in your shell before running Terraform.
+provider "google" {
+  project = var.project_id
+  region = var.region
+}
+
+terraform {
+  required_providers {
+    google = {
+      source = "hashicorp/google"
+      version = "4.52.0"
+    }
+  }
+
+  backend "gcs" {
+    # TODO: This has to be changed to match the current project or passed as a CLI
+    # argument: -backend-config="bucket=<bucket_id>"
+    bucket = "tpu-pytorch-tfstate-staging"
+    prefix = "terraform/state"
+  }
+}
diff --git a/docker/experimental/terraform/service_account.tf b/docker/experimental/terraform/service_account.tf
new file mode 100644
index 000000000000..d44311bef7ea
--- /dev/null
+++ b/docker/experimental/terraform/service_account.tf
@@ -0,0 +1,19 @@
+resource "google_service_account" "build_runner" {
+  project = var.project_id
+  account_id = "build-scheduler${var.build_runner_account_id_suffix}"
+}
+
+# TODO: No permission to add it.
+resource "google_project_iam_member" "build_runner_build_editor" { + project = var.project_id + role = "roles/cloudbuild.builds.editor" + member = "serviceAccount:${google_service_account.build_runner.email}" +} + +resource "google_project_iam_custom_role" "build_runner" { + project = var.project_id + role_id = "buildRunner" + title = "Build Runner" + description = "Grants permissions to trigger Cloud Builds." + permissions = ["cloudbuild.builds.create"] +} \ No newline at end of file diff --git a/docker/experimental/terraform/variables.tf b/docker/experimental/terraform/variables.tf new file mode 100644 index 000000000000..68d62af078f4 --- /dev/null +++ b/docker/experimental/terraform/variables.tf @@ -0,0 +1,101 @@ +variable "python_version" { + type = string + default = "3.8" + description = "Python version for all docker images." +} + +variable "public_docker_repo" { + type = object({ + id = string + location = optional(string, "us-central1") + }) +} + +variable "worker_pool" { + type = object({ + name = string + location = optional(string, "us-central1") + machine_type = optional(string, "e2-standard-32") + disk_size_gb = optional(number, 500) + }) +} + +variable "storage_bucket_suffix" { + type = string + default = "" +} + +variable "build_runner_account_id_suffix" { + type = string + default = "" +} + +variable "project_id" { + type = string + description = "ID of the GCP project." +} + +variable "region" { + type = string + default = "us-central1" +} + +variable "docker_images" { + type = list( + object({ + trigger_name = optional(string, "") + + # Name of the produced docker image (without tags). + image = string + + # Branch to fetch the Ansible setup at (not the XLA source code!) + branch = optional(string, "master") + + # Dockerfile path withing docker context (`dir` parameter). + dockerfile = optional(string, "Dockerfile") + + # Git tag to fetch the Ansible setup at (not the XLA source code!) 
+ # Contents of the `dir` directory will be fetched at this tag. + git_tag = optional(string, "") + + # Cloud Build trigger description (for human consumption). + description = optional(string, "") + + # Trigger build only if any of the following was modified in the specified + # `branch` or `tag`. + trigger_files = optional(list(string), []) + + # Trigger build on the specified cron schedule. + trigger_schedule = optional(string, "") + + # Base directory for docker context. + dir = optional(string, "docker/experimental/ansible") + + # Build args to pass to the dockerfile (`ARG build_arg=`). + build_args = optional(map(any), {}) + + # Tags for the produced docker image. + # Can include bash expression e.g. "my_tag_$(date +%Y%m%d)". + image_tags = optional(list(string), []) + + # Set to true, if any *.whl files from /dist should be uploaded to + # the public storage bucket. + wheels = optional(bool, false) + + # Build job timeout. + timeout_m = optional(number, 30) + }) + ) + + validation { + condition = alltrue([ + for di in var.docker_images : (di.branch == "") != (di.git_tag == "") + ]) + error_message = "Specify exactly one of `branch` or `git_tag` for each docker image." + } +} + +variable "triggers_suffix" { + type = string + default = "" +} diff --git a/docker/experimental/terraform/vars/staging.tfvars b/docker/experimental/terraform/vars/staging.tfvars new file mode 100644 index 000000000000..2a132799a617 --- /dev/null +++ b/docker/experimental/terraform/vars/staging.tfvars @@ -0,0 +1,140 @@ +# Each docker images is translated into a single build trigger. +docker_images = [ + { + image = "development" + description = "Build development image with TPU support." 
+    dockerfile = "development.Dockerfile"
+    trigger_files = [
+      "docker/experimental/ansible/**",
+      "docker/experimental/terraform_cloudbuild/**",
+    ]
+    build_args = {
+      python_version = "3.8"
+      arch = "amd64"
+      accelerator = "tpu"
+    }
+    image_tags = [
+      "tpu_amd64",
+      "tpu_amd64_$(date +%Y%m%d)",
+    ]
+  },
+  {
+    trigger_name = "xla-nightly-38-cuda11-8"
+    image = "xla"
+    description = "Build nightly image with CUDA support"
+    trigger_schedule = "0 0 * * *"
+    build_args = {
+      python_version = "3.8"
+      arch = "amd64"
+      accelerator = "cuda"
+      cuda_version = "11.8"
+      package_version = "2.0"
+    }
+    image_tags = [
+      "nightly_3.8_cuda11.8",
+      "nightly_3.8_cuda11.8_$(date +%Y%m%d)",
+    ]
+    wheels = true
+    timeout_m = 60 * 6
+  },
+  {
+    trigger_name = "xla-nightly-38-tpu"
+    image = "xla"
+    description = "Build nightly image with TPU support"
+    trigger_schedule = "0 0 * * *"
+    build_args = {
+      python_version = "3.8"
+      arch = "amd64"
+      accelerator = "tpu"
+      package_version = "2.0"
+    }
+    image_tags = [
+      "nightly_3.8_tpuvm",
+      "nightly_3.8_$(date +%Y%m%d)",
+    ]
+    wheels = true
+    timeout_m = 60 * 6
+  },
+  {
+    trigger_name = "xla-2-0-38-tpu"
+    image = "xla"
+    description = "Build v2.0.0 image with TPU support"
+    # Don't use this tag here, since the repository at version v2.0.0
+    # doesn't contain the Ansible setup. Instead, fetch PyTorch and XLA sources at
+    # the desired tag.
+    # git_tag = "..."
+    trigger_schedule = "0 0 * * *"
+    build_args = {
+      python_version = "3.8"
+      arch = "amd64"
+      accelerator = "tpu"
+      # Fetch sources at the desired tag.
+      pytorch_git_rev = "v2.0.0"
+      xla_git_rev = "v2.0.0"
+      package_version = "2.0"
+    }
+    image_tags = [
+      "r2.0_3.8_tpuvm",
+    ]
+    wheels = true
+    timeout_m = 60 * 6
+  },
+  {
+    trigger_name = "xla-1-13-38-tpu"
+    image = "xla"
+    description = "Build v1.13.0 image with TPU support"
+    # Don't use this tag here, since the repository at version v1.13.0
+    # doesn't contain the Ansible setup. Instead, fetch PyTorch and XLA sources at
+    # the desired tag.
+ # git_tag = "v1.13.0" + trigger_schedule = "0 0 * * *" + build_args = { + python_version = "3.8" + arch = "amd64" + accelerator = "tpu" + # Fetch sources at the desired tag. + pytorch_git_rev = "v1.13.0" + xla_git_rev = "v1.13.0" + package_version = "1.13" + } + image_tags = [ + "r1.13_3.8_tpuvm", + ] + wheels = true + timeout_m = 60 * 6 + }, + { + trigger_name = "xla-1-12-38-tpu" + image = "xla" + description = "Build v1.12.0 image with TPU support" + # git_tag = "v1.12.0" + trigger_schedule = "0 0 * * *" + build_args = { + python_version = "3.8" + arch = "amd64" + accelerator = "tpu" + pytorch_git_rev = "v1.12.0" + xla_git_rev = "v1.12.0" + package_version = "1.12" + } + image_tags = [ + "r1.12_3.8_tpuvm", + ] + wheels = true + timeout_m = 60 * 6 + }, +] + +# Variables for the staging environment. + +project_id = "tpu-pytorch" +public_docker_repo = { + id = "docker-public-staging" +} +worker_pool = { + name = "worker-pool-staging" + machine_type = "e2-standard-32" +} +storage_bucket_suffix = "-staging" +build_runner_account_id_suffix = "-staging" +triggers_suffix = "-staging" diff --git a/docker/experimental/terraform/worker_pool.tf b/docker/experimental/terraform/worker_pool.tf new file mode 100644 index 000000000000..ea1f22360ce1 --- /dev/null +++ b/docker/experimental/terraform/worker_pool.tf @@ -0,0 +1,18 @@ +resource "google_cloudbuild_worker_pool" "worker-pool" { + name = var.worker_pool.name + location = var.worker_pool.location + + worker_config { + disk_size_gb = var.worker_pool.disk_size_gb + machine_type = var.worker_pool.machine_type + no_external_ip = false + } +} + +locals { + worker_pool_id = google_cloudbuild_worker_pool.worker-pool.id +} + +output "worker_pool_id" { + value = local.worker_pool_id +}