-
Notifications
You must be signed in to change notification settings - Fork 559
Add Terraform config with CloudBuild triggers for building docker images and wheels #4604
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f224540
8b9f75e
e4b1d14
83f03a4
ae5bd04
9e20eff
23edf5f
962af4f
e42b986
12cdf8d
06f9555
756cc26
524cfb6
aa28b5a
9dafb22
eb5e81d
02dd57b
fd97bb0
267672f
4e3676d
dd8ecec
ca16d26
86b6b6f
c79b451
ae83150
a443d13
44ca33a
2a3170f
8097849
4b3eb79
95938b0
3777a71
0cb6c8b
8e20adb
4aa8fb5
c34fd6c
94fd434
6a99df9
ecf5578
7a57a1f
14aa5a8
fb085f4
3a4fefe
b6e70b1
075952c
cf01e1e
0321c16
0ff3559
7f61c1e
8af004a
b466327
6c4c79e
0bfe8b8
bb6435e
c69a03d
7da3dfa
b30aba2
afa66cd
9cfa91c
5d4ab4c
d269fa6
f09ec16
3b65e24
c8036d7
a5ff1be
4d7891d
a4ab9dc
2c3ecaf
e6e292e
1b791af
88ae574
c82b626
347c7b1
c97dc33
5eb4557
51614c9
d5c51e8
0e69044
1fabfd2
d46f9d0
b1cdde5
32ae9df
6142734
f2375a2
e0e2640
2207f85
7fe8115
e525c78
6ce0146
b001c64
059882d
e413012
5e71a9f
4839de0
c60ad3b
b203140
1a1d900
5498533
f97b74e
026af0c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| # Base image selectors. They are declared before the first FROM so that the | ||
| # FROM lines of the stages in this file can reference them. | ||
| ARG python_version=3.8 | ||
| ARG debian_version=buster | ||
|
|
||
| # Stage "build": runs the Ansible playbook with stage=build. | ||
| FROM python:${python_version}-${debian_version} AS build | ||
|
|
||
| RUN pip install ansible | ||
|
|
||
| # The build context is copied to /ansible and the playbook is run from there. | ||
| COPY . /ansible | ||
| WORKDIR /ansible | ||
|
|
||
| # Build parameters; each one is forwarded to the playbook as an extra var below. | ||
| # package_version has no default here, so it is empty unless passed via --build-arg. | ||
| ARG arch=amd64 | ||
| ARG accelerator=tpu | ||
| ARG cuda_version=11.8 | ||
| ARG pytorch_git_rev=HEAD | ||
| ARG xla_git_rev=HEAD | ||
| ARG package_version | ||
|
|
||
| RUN ansible-playbook -vvv playbook.yaml -e \ | ||
| "stage=build \ | ||
| arch=${arch} \ | ||
| accelerator=${accelerator} \ | ||
| cuda_version=${cuda_version} \ | ||
| pytorch_git_rev=${pytorch_git_rev} \ | ||
| xla_git_rev=${xla_git_rev} \ | ||
| package_version=${package_version}" | ||
|
|
||
| # Stage "release": installs dependencies via the playbook, then installs the | ||
| # wheels copied out of the "build" stage. | ||
| FROM python:${python_version}-${debian_version} AS release | ||
|
|
||
| WORKDIR /ansible | ||
| COPY . /ansible | ||
|
|
||
| # ARG values are scoped to a single build stage, so the ones used below are | ||
| # declared again for this stage. | ||
| ARG arch=amd64 | ||
| ARG accelerator=tpu | ||
| ARG cuda_version=11.8 | ||
| ARG pytorch_git_rev=HEAD | ||
| ARG xla_git_rev=HEAD | ||
|
|
||
| RUN pip install ansible | ||
| # Only the playbook tasks tagged "install_deps" are run, with stage=release. | ||
| RUN ansible-playbook -vvv playbook.yaml -e \ | ||
| "stage=release \ | ||
| arch=${arch} \ | ||
| accelerator=${accelerator} \ | ||
| cuda_version=${cuda_version} \ | ||
| pytorch_git_rev=${pytorch_git_rev} \ | ||
| xla_git_rev=${xla_git_rev} \ | ||
| " --tags "install_deps" | ||
|
|
||
| # Collect the wheels from the "build" stage's dist directories into /dist. | ||
| WORKDIR /dist | ||
| COPY --from=build /src/pytorch/dist/*.whl . | ||
| COPY --from=build /src/pytorch/xla/dist/*.whl . | ||
|
|
||
| # List the wheels in the build log, then install every wheel found in /dist. | ||
| RUN echo "Installing the following wheels" && ls /dist/*.whl | ||
| RUN pip install *.whl | ||
|
|
||
| # Remove the playbook sources that were copied in at the start of this stage. | ||
| WORKDIR / | ||
| RUN rm -rf /ansible |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| # Versions of cuda dependencies for given cuda versions. | ||
| # Note: wrap version in quotes to ensure they're treated as strings. | ||
| cuda_deps: | ||
| libcudnn: | ||
| "11.8": libcudnn8=8.8.0.121-1+cuda11.8 | ||
| libcudnn-dev: | ||
| "11.8": libcudnn8-dev=8.8.0.121-1+cuda11.8 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| # Used for fetching cuda from the right repo, see apt.yaml. | ||
| cuda_repo: ubuntu1804 | ||
| cuda_version: "11.8" | ||
| # Used for fetching clang from the right repo, see apt.yaml. | ||
| llvm_debian_repo: buster | ||
| clang_version: 10 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems that we're going for gcc in the end until clang/bazel interaction is fixed. We should keep clang around though for other things so nothing to change here, just fyi. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ack |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| # Dockerfile for building a development image. | ||
| # The built image contains all required pip and apt packages for building and | ||
| # running PyTorch and PyTorch/XLA. The image doesn't contain any source code. | ||
| ARG python_version=3.8 | ||
| ARG debian_version=buster | ||
|
|
||
| FROM python:${python_version}-${debian_version} | ||
|
|
||
| RUN pip install ansible | ||
|
|
||
| COPY . /ansible | ||
| WORKDIR /ansible | ||
|
|
||
| ARG arch=amd64 | ||
| ARG accelerator=tpu | ||
|
|
||
| # Run the playbook once with stage=build and once with stage=release, skipping | ||
| # the tasks tagged fetch_srcs and build_srcs in both runs. | ||
| RUN ansible-playbook playbook.yaml -e "stage=build arch=${arch} accelerator=${accelerator}" --skip-tags "fetch_srcs,build_srcs" | ||
| RUN ansible-playbook playbook.yaml -e "stage=release arch=${arch} accelerator=${accelerator}" --skip-tags "fetch_srcs,build_srcs" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ | |
| # localhost. | ||
| remote_src: true | ||
| strip: 1 | ||
| ignore_whitespace: true | ||
| basedir: "{{ (src_root, 'pytorch/xla/third_party/tensorflow') | path_join }}" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is now gone, and bazel does the patching. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But it's not gone for the v1.12 and v1.13 releases? The problem with this approach is that the build & deployment configuration is not bound to the source version (and for now I don't think we can improve on that). So this build process has to work for current and future versions of the code as well as older ones. Do you think that having this step while bazel is doing the patching is problematic? I added |
||
| loop: "{{ tf_patches.files | map(attribute='path') }}" | ||
| ignore_errors: true | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,26 +1,17 @@ | ||
| # Terraform configuration for build/test resources | ||
| # Terraform for CloudBuild triggers | ||
|
|
||
| Download the latest Terraform binary for your system and add it to your `$PATH`: | ||
| https://developer.hashicorp.com/terraform/downloads | ||
| This Terraform setup provisions: | ||
| - public storage bucket for PyTorch and PyTorch/XLA wheels. | ||
| - private storage bucket for Terraform state. | ||
| - public artifact repository for docker images. | ||
| - cloud builds for nightly and release docker images and wheels. | ||
| - schedule jobs and a service account for triggering cloud build. | ||
|
|
||
| Terraform state is stored in a shared GCS bucket. To initialize Terraform, run | ||
| the following: | ||
| # Running | ||
|
|
||
| ``` | ||
| # Authenticate with GCP | ||
| gcloud auth login --update-adc | ||
| 1. Run `gcloud auth application-default login` on your local workstation. | ||
| 2. Make sure that a recent Terraform binary is installed (>= 1.3.8). | ||
| If not, install Terraform from the [official source](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli). | ||
| 3. Run `terraform apply -var-file=vars/staging.tfvars`. | ||
|
|
||
| # Initialize Terraform | ||
| terraform init | ||
| ``` | ||
|
|
||
| To preview your changes run `terraform plan`. | ||
|
|
||
| If the changes look correct, you can update the project with `terraform apply`. | ||
|
|
||
| Resources: | ||
|
|
||
| - Official Terraform documentation: https://developer.hashicorp.com/terraform/docs | ||
| - GCP Terraform documentation: https://cloud.google.com/docs/terraform/get-started-with-terraform | ||
| - Storing Terraform state in GCS: https://cloud.google.com/docs/terraform/resource-management/store-state | ||
| - Cloud Build Trigger documentation: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| # Docker repository in Artifact Registry for all public images. | ||
| # Its location and repository id are taken from var.public_docker_repo. | ||
| resource "google_artifact_registry_repository" "public_docker_repo" { | ||
| location = var.public_docker_repo.location | ||
| repository_id = var.public_docker_repo.id | ||
| description = "Official docker images." | ||
| format = "DOCKER" | ||
| } | ||
|
|
||
| # Grants all users (public) read access to the public docker repository. | ||
| # Project, location and repository are taken from the repository resource itself. | ||
| resource "google_artifact_registry_repository_iam_member" "all_users_read_public_docker_repo" { | ||
| role = "roles/artifactregistry.reader" | ||
| member = "allUsers" | ||
| project = google_artifact_registry_repository.public_docker_repo.project | ||
| location = google_artifact_registry_repository.public_docker_repo.location | ||
| repository = google_artifact_registry_repository.public_docker_repo.name | ||
| } | ||
|
|
||
| # Shorthand for the public docker repository and the URL derived from it. | ||
| # The URL is built as <location>-docker.pkg.dev/<project_id>/<repository_id>. | ||
| locals { | ||
| public_repo = google_artifact_registry_repository.public_docker_repo | ||
| public_docker_repo_url = "${local.public_repo.location}-docker.pkg.dev/${var.project_id}/${local.public_repo.repository_id}" | ||
| } | ||
|
|
||
| # URL of the public docker repository, as computed in local.public_docker_repo_url. | ||
| output "public_docker_registry_url" { | ||
| description = "URL of the public docker repository (LOCATION-docker.pkg.dev/PROJECT/REPOSITORY)." | ||
| value = local.public_docker_repo_url | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| # Storage bucket for Terraform state. Object versioning is enabled, and | ||
| # force_destroy is off so the bucket cannot be destroyed while it holds objects. | ||
| resource "google_storage_bucket" "tfstate" { | ||
| name = "${var.project_id}-tfstate${var.storage_bucket_suffix}" | ||
| force_destroy = false | ||
| location = "US" | ||
| storage_class = "STANDARD" | ||
|
|
||
| # Required by project policy. | ||
| # See https://cloud.google.com/storage/docs/uniform-bucket-level-access. | ||
| # No ACL resource is attached to this bucket, so uniform access is enabled. | ||
| uniform_bucket_level_access = true | ||
|
|
||
| versioning { | ||
| enabled = true | ||
| } | ||
| } | ||
|
|
||
| # Storage bucket for all publicly released wheels. | ||
| resource "google_storage_bucket" "public_wheels" { | ||
| name = "${var.project_id}-wheels-public" | ||
| force_destroy = false | ||
| location = "US" | ||
| storage_class = "STANDARD" | ||
|
|
||
| # NOTE(review): presumably left disabled because public read access is granted | ||
| # through the bucket ACL resource | ||
| # google_storage_bucket_access_control.all_users_read_public_wheels - confirm. | ||
| uniform_bucket_level_access = false | ||
|
|
||
| versioning { | ||
| enabled = true | ||
| } | ||
| } | ||
|
|
||
| # Grants all users (public) read access to the bucket with wheels. | ||
| # Implemented as a bucket ACL entry: role READER for entity allUsers. | ||
| resource "google_storage_bucket_access_control" "all_users_read_public_wheels" { | ||
| bucket = google_storage_bucket.public_wheels.name | ||
| role = "READER" | ||
| entity = "allUsers" | ||
| } | ||
|
|
||
| # Exposes the url attribute of the public wheels bucket. | ||
| output "public_wheels_bucket_url" { | ||
| description = "URL of the storage bucket with publicly released wheels." | ||
| value = google_storage_bucket.public_wheels.url | ||
| } | ||
|
|
||
| # Exposes the url attribute of the Terraform state bucket. | ||
| output "tfstate_bucket_url" { | ||
| description = "URL of the storage bucket that holds Terraform state." | ||
| value = google_storage_bucket.tfstate.url | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I love how simple the Dockerfile becomes with ansible!