Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions docker/experimental/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ARG tf_cuda_compute_capabilities="7.0,7.5,8.0"

ARG tpuvm=1
ARG build_cpp_tests=0
ARG package_version=1.14.0
ARG package_version=2.0.0

ARG bazel_jobs=

Expand Down Expand Up @@ -43,10 +43,13 @@ ENV CXX=clang++-8

RUN pip install mkl mkl-include setuptools typing_extensions cmake requests

RUN git clone --recursive --depth=1 https://github.com/pytorch/pytorch.git
RUN git clone --depth=1 https://github.com/pytorch/pytorch.git
WORKDIR /pytorch
COPY torch_patches/ torch_patches/
RUN bash -c "torch_pin=$(cat torch_patches/.torch_pin &); git fetch origin ${torch_pin:-master}; git checkout origin/${torch_pin:-master}"
# Check out the pinned PyTorch revision. torch_patches/.torch_pin is optional:
# when it is missing or empty the build falls back to master. The checkout only
# runs when the fetch succeeded.
RUN bash -c 'torch_pin=$(cat torch_patches/.torch_pin 2>/dev/null || true); git fetch origin "${torch_pin:-master}" && git checkout FETCH_HEAD'
# Submodules are initialised after the checkout, i.e. for the revision that was
# just checked out.
RUN git submodule update --init --recursive
# Apply local PyTorch patches. `find` is given an explicit start directory
# because with the implicit `.` it prints `./torch_patches/...`, which a
# `-wholename 'torch_patches/*.diff'` pattern never matches. `-n1` runs one
# `patch` per file: `patch -i a.diff b.diff` would treat b.diff as the file to
# patch. `sort` keeps the application order deterministic.
RUN find torch_patches -name '*.diff' | sort | xargs -r -n1 patch -N -p1 -i


# Disable CUDA for PyTorch
ENV USE_CUDA "0"
Expand All @@ -68,8 +71,7 @@ FROM builder AS artifacts

COPY tf_patches/ tf_patches/
COPY third_party/ third_party/

RUN for p in tf_patches/*.diff; do patch -d third_party/tensorflow -N -p1 < $p; done
# Apply TensorFlow patches inside third_party/tensorflow, one `patch` call per
# file (`-n1`), in sorted order. The patch paths are made absolute because
# `patch -d` changes directory before it opens the `-i` file, so a path relative
# to the build directory would no longer resolve. An explicit start directory is
# used because `find` with the implicit `.` prints `./tf_patches/...`, which a
# `-wholename 'tf_patches/*.diff'` pattern never matches.
RUN find "$(pwd)/tf_patches" -name '*.diff' | sort | xargs -r -n1 patch -d third_party/tensorflow -N -p1 -i

COPY build_torch_xla_libs.sh .

Expand Down
48 changes: 25 additions & 23 deletions docker/experimental/cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ steps:

args=(${_BUILD_ARGS//,/ })
flags=(
--cache=${_CACHE}
--cache-ttl=${_CACHE_TTL}
--cache-repo=${_CACHE_IMAGE_URL}
--build-arg=bazel_jobs=${_BAZEL_JOBS}
--build-arg=python_version=${_PYTHON_VERSION}
# TODO: use current version in setup.py
Expand All @@ -31,31 +28,27 @@ steps:
- name: docker
path: /docker
- id: 'build-wheels'
name: 'gcr.io/kaniko-project/executor:debug'
name: 'gcr.io/cloud-builders/docker'
entrypoint: sh
args:
- -cx
- |
/kaniko/executor \
--dockerfile=docker/experimental/Dockerfile \
--destination=${_ARTIFACTS_IMAGE_URL} \
--tar-path=/docker/artifacts-image.tar \
docker build . \
--file=docker/experimental/Dockerfile \
--tag=${_ARTIFACTS_IMAGE_URL} \
--target=artifacts \
$(cat /docker/flags.txt)
timeout: 14400s
volumes:
- name: docker
path: /docker
- id: 'import-artifacts'
name: gcr.io/cloud-builders/docker
- id: 'push-artifacts'
name: 'gcr.io/cloud-builders/docker'
args:
- load
- --input
- /docker/artifacts-image.tar
volumes:
- name: docker
path: /docker
- push
- ${_ARTIFACTS_IMAGE_URL}
- id: 'copy-wheels'
waitFor: ['build-wheels']
name: ${_ARTIFACTS_IMAGE_URL}
entrypoint: bash
args:
Expand All @@ -67,6 +60,7 @@ steps:
- name: 'wheels'
path: /wheels
- id: 'install-twine'
waitFor: ['build-wheels']
name: python
entrypoint: pip
args:
Expand All @@ -90,21 +84,26 @@ steps:
path: /wheels
- id: 'release-image'
waitFor: ['build-wheels']
name: 'gcr.io/kaniko-project/executor:debug'
name: 'gcr.io/cloud-builders/docker'
entrypoint: sh
args:
- -cx
- |
/kaniko/executor \
--dockerfile=docker/experimental/Dockerfile \
--destination=${_RELEASE_IMAGE_URL} \
docker build . \
--file=docker/experimental/Dockerfile \
--tag=${_RELEASE_IMAGE_URL} \
$(cat /docker/flags.txt)
timeout: 14400s
volumes:
- name: docker
path: /docker
- id: 'push-release'
name: 'gcr.io/cloud-builders/docker'
args:
- push
- ${_RELEASE_IMAGE_URL}
- id: 'tag-image'
waitFor: ['release-image']
waitFor: ['push-artifacts', 'push-release']
name: 'google/cloud-sdk'
entrypoint: bash
args:
Expand All @@ -122,12 +121,15 @@ substitutions:
_RELEASE_IMAGE_URL: ${_IMAGE_REPOSITORY}/torch-xla:${_IMAGE_TAG}
_ARTIFACTS_IMAGE_URL: ${_IMAGE_REPOSITORY}/artifacts:${_IMAGE_TAG}
_CACHE_IMAGE_URL: ${_IMAGE_REPOSITORY}/cache
_BAZEL_JOBS: '16'
_BAZEL_JOBS: '32'
_CACHE: 'true'
_CACHE_TTL: '18h'
_BUILD_ARGS: tpuvm=1,cuda=0
options:
machineType: E2_HIGHCPU_32
# To run in a test project, either point to your pool or replace this with
# `machineType`. You may have to reduce _BAZEL_JOBS.
# Worker pool resource names take the form
# projects/PROJECT/locations/REGION/workerPools/POOL_ID.
workerPool: projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build
# machineType: E2_HIGHCPU_32
dynamic_substitutions: true
substitution_option: 'ALLOW_LOOSE'
timeout: 24000s
40 changes: 40 additions & 0 deletions docker/experimental/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,37 @@ resource "google_artifact_registry_repository" "torch-xla-docker-repo" {
format = "DOCKER"
}

# Cloud Build worker pool "wheel_build" in us-central1.
resource "google_cloudbuild_worker_pool" "gcb-pool" {
  location = "us-central1"
  name     = "wheel_build"

  worker_config {
    machine_type = "e2-standard-32"
    disk_size_gb = 500
    # Workers keep an external IP address.
    no_external_ip = false
  }
}

# Service account whose email is handed to every trigger module in this file
# as `scheduler_service_account`.
resource "google_service_account" "cloud-build-trigger-scheduler" {
  display_name = "Cloud Build Trigger Scheduler"
  description  = "Service account for running Cloud Build triggers in a Cloud Scheduler job"
  account_id   = "cloud-build-trigger-scheduler"
}

# Grants the scheduler service account the Cloud Build editor role in the
# project that owns the account.
resource "google_project_iam_member" "cloud-build-scheduler-permission" {
  member  = "serviceAccount:${google_service_account.cloud-build-trigger-scheduler.email}"
  role    = "roles/cloudbuild.builds.editor"
  project = google_service_account.cloud-build-trigger-scheduler.project
}

# Nightly build trigger for Python 3.7 on the tpuvm platform.
module "nightly-py37-tpuvm" {
  source = "./modules/trigger"

  platform                  = "tpuvm"
  python_version            = "3.7"
  release                   = "nightly"
  docker_build_args         = ["tpuvm=1"]
  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
}

module "nightly-py38-tpuvm" {
Expand All @@ -53,6 +77,7 @@ module "nightly-py38-tpuvm" {
python_version = "3.8"
platform = "tpuvm"
docker_build_args = [ "tpuvm=1" ]
scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
}

module "nightly-py38-tpunode" {
Expand All @@ -62,6 +87,7 @@ module "nightly-py38-tpunode" {
python_version = "3.8"
platform = "tpunode"
docker_build_args = [ "tpuvm=0" ]
scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
}

module "nightly-py38-cuda112" {
Expand All @@ -71,4 +97,18 @@ module "nightly-py38-cuda112" {
python_version = "3.8"
platform = "cuda112"
docker_build_args = [ "tpuvm=0,cuda=1"]
scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
}

# Build trigger for release 1.13 (Python 3.7, tpuvm), built from the
# wcromar/r1.13-kaggle branch.
module "r113-py37-tpuvm" {
  source = "./modules/trigger"

  platform          = "tpuvm"
  python_version    = "3.7"
  release           = "1.13"
  branch            = "wcromar/r1.13-kaggle"
  docker_build_args = ["tpuvm=1"]

  # Build on pushes to the branch; a null schedule is passed instead of a cron
  # expression.
  build_on_push = true
  schedule      = null

  scheduler_service_account = google_service_account.cloud-build-trigger-scheduler.email
}
60 changes: 45 additions & 15 deletions docker/experimental/terraform/modules/trigger/main.tf
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
data "google_project" "project" { }

# Release label for this trigger. Callers in this repository pass values such
# as "nightly" or "1.13". It is used in the trigger name and passed to the
# build as the _RELEASE_VERSION substitution.
variable "release" {
type = string
}

# Git branch to build. Used for the source ref (refs/heads/<branch>) and, when
# building on push, as an exact-match push filter.
variable "branch" {
  default = "master"
  type    = string
}

variable "python_version" {
description = "Python version to use (e.g. 3.8)"
type = string
Expand All @@ -16,51 +23,74 @@ variable "docker_build_args" {
default = [ "tpuvm=1" ]
}

# Cron schedule for the Cloud Scheduler job that runs this trigger. When the
# value is null, no scheduler job is created.
# Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules
variable "schedule" {
  default = "0 0 * * *"
  type    = string
}

# Email of the service account used for the scheduler job's OAuth token.
variable "scheduler_service_account" {
  default = null
  type    = string
}

# Whether the trigger should also fire on pushes to `branch`. Declared as bool
# because the default is a boolean and the value is used directly as the
# condition that enables the trigger's `github` push block. Existing callers
# that pass `true`/`false` are unaffected.
variable "build_on_push" {
  type    = bool
  default = false
}

locals {
trigger_name = format("pytorch-xla-%s-py%s-%s", var.release, replace(var.python_version, ".", ""), var.platform)
trigger_name = format("pytorch-xla-%s-py%s-%s", replace(var.release, ".", "-"), replace(var.python_version, ".", ""), var.platform)
}

resource "google_cloudbuild_trigger" "build-trigger" {
location = "global"
name = local.trigger_name
filename = "docker/experimental/cloudbuild.yaml"

source_to_build {
uri = "https://github.com/pytorch/xla"
repo_type = "GITHUB"
# TODO: make branch configurable
ref = "refs/heads/master"
dynamic "github" {
# HACK: only add `github` section at all when building on push
for_each = var.build_on_push ? [1] : []

content {
owner = "pytorch"
name = "xla"
push {
# `branch` is treated as a regex, so look for exact match
branch = "^${var.branch}$"
}
}
}

git_file_source {
path = "docker/experimental/cloudbuild.yaml"
repo_type = "GITHUB"
# TODO: make branch configurable
revision = "refs/heads/master"
source_to_build {
uri = "https://github.com/pytorch/xla"
repo_type = "GITHUB"
ref = "refs/heads/${var.branch}"
}

substitutions = {
_RELEASE_VERSION = var.release
_PLATFORM = var.platform
_BUILD_ARGS = join(",", var.docker_build_args)
_PYTHON_VERSION = var.python_version
}
}

resource "google_cloud_scheduler_job" "trigger-schedule" {
count = var.schedule != null ? 1 : 0

name = format("%s-schedule", local.trigger_name)
region = "us-central1"

# Format: https://cloud.google.com/scheduler/docs/configuring/cron-job-schedules
schedule = "0 0 * * *"
schedule = var.schedule
time_zone = "America/Los_Angeles"

http_target {
http_method = "POST"
uri = "https://cloudbuild.googleapis.com/v1/projects/${google_cloudbuild_trigger.build-trigger.project}/triggers/${google_cloudbuild_trigger.build-trigger.trigger_id}:run"

oauth_token {
# TODO: Include this SA in config
service_account_email = "cloud-build-trigger-scheduler@tpu-pytorch.iam.gserviceaccount.com"
service_account_email = var.scheduler_service_account
}
}
}